From 4074c995d2a88684fd4a9d1aa36d51de56bb8dab Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 30 Jan 2018 18:13:46 +0000 Subject: COMPMID-873: Integrate RSH NEON Depthwise Convolution routine Change-Id: Ida1e9a836bc518bfe5563e16bf7f92bde5fc13f7 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118472 Tested-by: Jenkins Reviewed-by: Pablo Tello --- .../winograd/transforms/input_2x2_3x3_fp32.cpp | 409 +++++++++++++++++ .../winograd/transforms/input_2x2_5x5_fp32.cpp | 458 +++++++++++++++++++ .../winograd/transforms/input_4x4_3x3_fp32.cpp | 486 +++++++++++++++++++++ .../winograd/transforms/output_2x2_3x3_fp32.cpp | 251 +++++++++++ .../winograd/transforms/output_2x2_5x5_fp32.cpp | 242 ++++++++++ .../winograd/transforms/output_4x4_3x3_fp32.cpp | 306 +++++++++++++ .../winograd/transforms/weights_2x2_3x3_fp32.cpp | 228 ++++++++++ .../winograd/transforms/weights_2x2_5x5_fp32.cpp | 408 +++++++++++++++++ .../winograd/transforms/weights_4x4_3x3_fp32.cpp | 266 +++++++++++ 9 files changed, 3054 insertions(+) create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp create mode 100644 src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp (limited to 'src/core/NEON/kernels/convolution/winograd/transforms') diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp new file mode 100644 index 0000000000..6d8afc0def --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 3, 3>::InputTransform; + +/****************************************************************************** + * Cost methods for the input transform. + * ===================================== + */ +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &input_shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. + const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows); + const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols); + return 16 * 16 * tile_M * tile_N * input_shape.n_channels; +} +/*****************************************************************************/ + +/***************************************************************************** +* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a +* variety of padding types. For example, tiles at the top and left of an image +* can require one row or column of padding on their top and left sides if the +* padding type is SAME (where X represents a padded value): +* +* _______ _______ +* |X X X X| |X X X X| +* |X | | | . . . +* |X | | | +* |X______| |_______| +* _______ +* |X | . +* |X | . . . . +* |X | . +* |X______| +* +* For tiles near the right or bottom of the image it is more complicated. Such +* tiles might require padding by 0 or 1 rows or columns if the padding type is +* VALID or 1 or 2 rows or columns if the padding type is SAME: +* +* _______ _______ _______ _______ +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X______| |_______| |______X| |____X_X| +* _______ _______ _______ _______ +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X______| |_______| |______X| |____X_X| +* _______ _______ _______ _______ +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* _______ _______ _______ _______ +* |X | | | | X| | X X| +* |X | | | | X| | X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* Additional tiles are required for especially small input images. +* +* Build an array of the specialised methods that deal with each of the +* different padding combinations which may be required. These padding +* constraints are the space: +* +* Padding top in {0, 1} +* Padding left in {0, 1} +* Padding bottom in {0, 1, 2} +* Padding right in {0, 1, 2} +*/ +template <> +template <> +template +void Transform::process_tile( + int n_channels, + const float* const input_base, + const int input_row_stride, + const int input_col_stride, + float* const matrix_base, + const int matrix_stride +) +{ + constexpr int inner_tile_i = 4, inner_tile_j = 4; + constexpr int cells_i = inner_tile_i - pad_bottom; + constexpr int cells_j = inner_tile_i - pad_right; + + float *outptr = matrix_base; + + // Get pointers into the input tile + const float *x_ptrs[inner_tile_i][inner_tile_j]; + for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) + { + // Get a pointer into the row + const float* const row_ptr = input_base + xi*input_row_stride; + + for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) + { + x_ptrs[i][j] = row_ptr + xj*input_col_stride; + } + } + + // Matrices used/computed in this kernel. + float x[inner_tile_i][inner_tile_j]; + float XTx[inner_tile_i][inner_tile_j]; + float U[inner_tile_i][inner_tile_j]; + + for (int i = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++) + { + x[i][j] = XTx[i][j] = 0.0f; + } + } + + // Perform the Winograd input transformation for each channel in the input + // tensor. + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used/computed in this kernel. + float32x4_t x[inner_tile_i][inner_tile_j]; + float32x4_t XTx[inner_tile_i][inner_tile_j]; + float32x4_t U[inner_tile_i][inner_tile_j]; + + for (int i = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++) + { + x[i][j] = vdupq_n_f32(0.0f); + XTx[i][j] = vdupq_n_f32(0.0f); + } + } + + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1q_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 4; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = x[0][j] - x[2][j]; + XTx[0][j] = vsubq_f32(x[0][j], x[2][j]); + + // XTx[1][j] = x[1][j] + x[2][j]; + XTx[1][j] = vaddq_f32(x[1][j], x[2][j]); + + // XTx[2][j] = x[2][j] - x[1][j]; + XTx[2][j] = vsubq_f32(x[2][j], x[1][j]); + + // XTx[3][j] = x[1][j] - x[3][j]; + XTx[3][j] = vsubq_f32(x[1][j], x[3][j]); + } + + // Compute U = XT . x . X + for (int i = 0; i < inner_tile_i; i++) + { + // U[i][0] = XTx[i][0] - XTx[i][2]; + U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]); + + // U[i][1] = XTx[i][1] + XTx[i][2]; + U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]); + + // U[i][2] = XTx[i][2] - XTx[i][1]; + U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]); + + // U[i][3] = XTx[i][1] - XTx[i][3]; + U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used/computed in this kernel. + float32x2_t x[inner_tile_i][inner_tile_j]; + float32x2_t XTx[inner_tile_i][inner_tile_j]; + float32x2_t U[inner_tile_i][inner_tile_j]; + + for (int i = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++) + { + x[i][j] = vdup_n_f32(0.0f); + XTx[i][j] = vdup_n_f32(0.0f); + } + } + + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 2; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = x[0][j] - x[2][j]; + XTx[0][j] = vsub_f32(x[0][j], x[2][j]); + + // XTx[1][j] = x[1][j] + x[2][j]; + XTx[1][j] = vadd_f32(x[1][j], x[2][j]); + + // XTx[2][j] = x[2][j] - x[1][j]; + XTx[2][j] = vsub_f32(x[2][j], x[1][j]); + + // XTx[3][j] = x[1][j] - x[3][j]; + XTx[3][j] = vsub_f32(x[1][j], x[3][j]); + } + + // Compute U = XT . x . X + for (int i = 0; i < inner_tile_i; i++) + { + // U[i][0] = XTx[i][0] - XTx[i][2]; + U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]); + + // U[i][1] = XTx[i][1] + XTx[i][2]; + U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]); + + // U[i][2] = XTx[i][2] - XTx[i][1]; + U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]); + + // U[i][3] = XTx[i][1] - XTx[i][3]; + U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = *(x_ptrs[i][j]++); + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + XTx[0][j] = x[0][j] - x[2][j]; + XTx[1][j] = x[1][j] + x[2][j]; + XTx[2][j] = x[2][j] - x[1][j]; + XTx[3][j] = x[1][j] - x[3][j]; + } + + // Compute U = XT . x . X + for (int i = 0; i < inner_tile_i; i++) + { + U[i][0] = XTx[i][0] - XTx[i][2]; + U[i][1] = XTx[i][1] + XTx[i][2]; + U[i][2] = XTx[i][2] - XTx[i][1]; + U[i][3] = XTx[i][1] - XTx[i][3]; + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + *(outptr + m*matrix_stride) = U[i][j]; + } + } + outptr++; + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = +{ + { + { + { + Transform::template process_tile<0, 0, 0, 0>, // No padding + Transform::template process_tile<0, 0, 0, 1>, // Right + Transform::template process_tile<0, 0, 0, 2>, // Right + }, + { + Transform::template process_tile<0, 0, 1, 0>, // Bottom + Transform::template process_tile<0, 0, 1, 1>, // Bottom-right + Transform::template process_tile<0, 0, 1, 2>, // Bottom-right + }, + { + Transform::template process_tile<0, 0, 2, 0>, // Bottom + Transform::template process_tile<0, 0, 2, 1>, // Bottom-right + Transform::template process_tile<0, 0, 2, 2>, // Bottom-right + } + }, + { + { + Transform::template process_tile<0, 1, 0, 0>, // Left + Transform::template process_tile<0, 1, 0, 1>, // Left AND right + Transform::template process_tile<0, 1, 0, 2>, // Left AND right + }, + { + Transform::template process_tile<0, 1, 1, 0>, // Left-bottom + Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right + Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right + }, + { + Transform::template process_tile<0, 1, 2, 0>, // Left-bottom + Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right + Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right + } + }, + }, + { + { + { + Transform::template process_tile<1, 0, 0, 0>, // Top + Transform::template process_tile<1, 0, 0, 1>, // Top-right + Transform::template process_tile<1, 0, 0, 2>, // Top-right + }, + { + Transform::template process_tile<1, 0, 1, 0>, // Top AND bottom + Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right + Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right + }, + { + Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom + Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right + Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right + } + }, + { + { + Transform::template process_tile<1, 1, 0, 0>, // Top-left + Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right + Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right + }, + { + Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom + Transform::template process_tile<1, 1, 1, 1>, // All padded + Transform::template process_tile<1, 1, 1, 2>, // All padded + }, + { + Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom + Transform::template process_tile<1, 1, 2, 1>, // All padded + Transform::template process_tile<1, 1, 2, 2>, // All padded + } + } + } +}; + +template struct WinogradGEMM<2, 2, 3, 3>::InputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp new file mode 100644 index 0000000000..d9ebe8b7cd --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 5, 5>::InputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &input_shape) +{ + return 0; // TODO +} + +/***************************************************************************** +* F(2x2, 5x5) implies the use of a 6x6 input tile. +* +* Build an array of the specialised methods that deal with each of the +* different padding combinations which may be required. These padding +* constraints are the space: +* +* Padding top in {0, 1} +* Padding left in {0, 1} +* Padding bottom in {0, 1, 2, 3, 4} +* Padding right in {0, 1, 2, 3, 4} +*/ +template <> +template <> +template +void Transform::process_tile( + int n_channels, + const float* const input_base, + const int input_row_stride, + const int input_col_stride, + float* const matrix_base, + const int matrix_stride +) +{ + constexpr int cells_i = 6 - pad_bottom; + constexpr int cells_j = 6 - pad_right; + + float *outptr = matrix_base; + + // Get pointers into the input tile + const float *x_ptrs[6][6]; + for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) + { + // Get a pointer into the row + const float* const row_ptr = input_base + xi*input_row_stride; + + for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) + { + x_ptrs[i][j] = row_ptr + xj*input_col_stride; + } + } + + // Matrices used/computed in this kernel. + float x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = XTx[i][j] = 0.0f; + } + } + + // Perform the Winograd input transformation for each channel in the input + // tensor. + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used/computed in this kernel + float32x4_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdupq_n_f32(0.0f); + XTx[i][j] = vdupq_n_f32(0.0f); + } + } + + // Read a 6x6 tile in the Winograd domain + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1q_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 4; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used/computed in this kernel + float32x2_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdup_n_f32(0.0f); + XTx[i][j] = vdup_n_f32(0.0f); + } + } + + // Read a 6x6 tile in the Winograd domain + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 2; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = *(x_ptrs[i][j]++); + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = U[i][j]; + } + } + outptr++; + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = +{ + { + { + { + Transform::template process_tile<0, 0, 0, 0>, // No padding + Transform::template process_tile<0, 0, 0, 1>, // Right + Transform::template process_tile<0, 0, 0, 2>, // " " + Transform::template process_tile<0, 0, 0, 3>, // " " + Transform::template process_tile<0, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 1, 0>, // Bottom + Transform::template process_tile<0, 0, 1, 1>, // Bottom right + Transform::template process_tile<0, 0, 1, 2>, // " " + Transform::template process_tile<0, 0, 1, 3>, // " " + Transform::template process_tile<0, 0, 1, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 2, 0>, // Bottom + Transform::template process_tile<0, 0, 2, 1>, // Bottom right + Transform::template process_tile<0, 0, 2, 2>, // " " + Transform::template process_tile<0, 0, 2, 3>, // " " + Transform::template process_tile<0, 0, 2, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 3, 0>, // Bottom + Transform::template process_tile<0, 0, 3, 1>, // Bottom right + Transform::template process_tile<0, 0, 3, 2>, // " " + Transform::template process_tile<0, 0, 3, 3>, // " " + Transform::template process_tile<0, 0, 3, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 4, 0>, // Bottom + Transform::template process_tile<0, 0, 4, 1>, // Bottom right + Transform::template process_tile<0, 0, 4, 2>, // " " + Transform::template process_tile<0, 0, 4, 3>, // " " + Transform::template process_tile<0, 0, 4, 4>, // " " + } + }, + { + { + Transform::template process_tile<0, 1, 0, 0>, // Left + Transform::template process_tile<0, 1, 0, 1>, + Transform::template process_tile<0, 1, 0, 2>, + Transform::template process_tile<0, 1, 0, 3>, + Transform::template process_tile<0, 1, 0, 4>, + }, + { + Transform::template process_tile<0, 1, 1, 0>, // Bottom left + Transform::template process_tile<0, 1, 1, 1>, + Transform::template process_tile<0, 1, 1, 2>, + Transform::template process_tile<0, 1, 1, 3>, + Transform::template process_tile<0, 1, 1, 4>, + }, + { + Transform::template process_tile<0, 1, 2, 0>, // " " + Transform::template process_tile<0, 1, 2, 1>, + Transform::template process_tile<0, 1, 2, 2>, + Transform::template process_tile<0, 1, 2, 3>, + Transform::template process_tile<0, 1, 2, 4>, + }, + { + Transform::template process_tile<0, 1, 3, 0>, // " " + Transform::template process_tile<0, 1, 3, 1>, + Transform::template process_tile<0, 1, 3, 2>, + Transform::template process_tile<0, 1, 3, 3>, + Transform::template process_tile<0, 1, 3, 4>, + }, + { + Transform::template process_tile<0, 1, 4, 0>, // " " + Transform::template process_tile<0, 1, 4, 1>, + Transform::template process_tile<0, 1, 4, 2>, + Transform::template process_tile<0, 1, 4, 3>, + Transform::template process_tile<0, 1, 4, 4>, + } + } + }, + { + { + { + Transform::template process_tile<1, 0, 0, 0>, // Top + Transform::template process_tile<1, 0, 0, 1>, // Top right + Transform::template process_tile<1, 0, 0, 2>, // " " + Transform::template process_tile<1, 0, 0, 3>, // " " + Transform::template process_tile<1, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<1, 0, 1, 0>, + Transform::template process_tile<1, 0, 1, 1>, + Transform::template process_tile<1, 0, 1, 2>, + Transform::template process_tile<1, 0, 1, 3>, + Transform::template process_tile<1, 0, 1, 4>, + }, + { + Transform::template process_tile<1, 0, 2, 0>, + Transform::template process_tile<1, 0, 2, 1>, + Transform::template process_tile<1, 0, 2, 2>, + Transform::template process_tile<1, 0, 2, 3>, + Transform::template process_tile<1, 0, 2, 4>, + }, + { + Transform::template process_tile<1, 0, 3, 0>, + Transform::template process_tile<1, 0, 3, 1>, + Transform::template process_tile<1, 0, 3, 2>, + Transform::template process_tile<1, 0, 3, 3>, + Transform::template process_tile<1, 0, 3, 4>, + }, + { + Transform::template process_tile<1, 0, 4, 0>, + Transform::template process_tile<1, 0, 4, 1>, + Transform::template process_tile<1, 0, 4, 2>, + Transform::template process_tile<1, 0, 4, 3>, + Transform::template process_tile<1, 0, 4, 4>, + }, + }, + { + { + Transform::template process_tile<1, 1, 0, 0>, // Top left + Transform::template process_tile<1, 1, 0, 1>, + Transform::template process_tile<1, 1, 0, 2>, + Transform::template process_tile<1, 1, 0, 3>, + Transform::template process_tile<1, 1, 0, 4>, + }, + { + Transform::template process_tile<1, 1, 1, 0>, + Transform::template process_tile<1, 1, 1, 1>, + Transform::template process_tile<1, 1, 1, 2>, + Transform::template process_tile<1, 1, 1, 3>, + Transform::template process_tile<1, 1, 1, 4>, + }, + { + Transform::template process_tile<1, 1, 2, 0>, + Transform::template process_tile<1, 1, 2, 1>, + Transform::template process_tile<1, 1, 2, 2>, + Transform::template process_tile<1, 1, 2, 3>, + Transform::template process_tile<1, 1, 2, 4>, + }, + { + Transform::template process_tile<1, 1, 3, 0>, + Transform::template process_tile<1, 1, 3, 1>, + Transform::template process_tile<1, 1, 3, 2>, + Transform::template process_tile<1, 1, 3, 3>, + Transform::template process_tile<1, 1, 3, 4>, + }, + { + Transform::template process_tile<1, 1, 4, 0>, + Transform::template process_tile<1, 1, 4, 1>, + Transform::template process_tile<1, 1, 4, 2>, + Transform::template process_tile<1, 1, 4, 3>, + Transform::template process_tile<1, 1, 4, 4>, + } + } + } +}; + +template struct WinogradGEMM<2, 2, 5, 5>::InputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp new file mode 100644 index 0000000000..04d1573e4c --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &input_shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. + const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows); + const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols); + return 12 * 24 * tile_M * tile_N * input_shape.n_channels; +} + +/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a +* variety of padding types. For example, tiles at the top and left of an +* image can require one row or column of padding on their top and left sides +* if the padding type is SAME (where X represents a padded value): +* +* ___________ ___________ +* |X X X X X X| |X X X X X X| +* |X | | | +* |X | | | +* |X | | | +* |X | | | +* |X__________| |___________| +* ___________ +* |X | +* |X | +* |X | +* |X | +* |X | +* |X__________| +* +* For tiles near the right or bottom of the image it is more complicated. +* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the +* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding +* type is SAME. +* +* Build an array of the specialised methods that deal with each of the +* different padding combinations which may be required. These padding +* constraints are the space: +* +* Padding top in {0, 1} +* Padding left in {0, 1} +* Padding bottom in {0, 1, 2, 3, 4} +* Padding right in {0, 1, 2, 3, 4} +*/ +template <> +template <> +template +void Transform::process_tile( + int n_channels, + const float* const input_base, + const int input_row_stride, + const int input_col_stride, + float* const matrix_base, + const int matrix_stride +) +{ + constexpr int cells_i = 6 - pad_bottom; + constexpr int cells_j = 6 - pad_right; + + float *outptr = matrix_base; + + // Get pointers into the input tile + const float *x_ptrs[6][6]; + for (int i = pad_top, xi = 0; i < cells_i; i++, xi++) + { + // Get a pointer into the row + const float* const row_ptr = input_base + xi*input_row_stride; + + for (int j = pad_left, xj = 0; j < cells_j; j++, xj++) + { + x_ptrs[i][j] = row_ptr + xj*input_col_stride; + } + } + + // Matrices used/computed in this kernel. + float x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = XTx[i][j] = 0.0f; + } + } + + // Perform the Winograd input transformation for each channel in the input + // tensor. + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used/computed in this kernel + float32x4_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdupq_n_f32(0.0f); + XTx[i][j] = vdupq_n_f32(0.0f); + } + } + + // Read a 6x6 tile in the Winograd domain + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1q_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 4; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used/computed in this kernel + float32x2_t x[6][6], XTx[6][6], U[6][6]; + for (int i = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++) + { + x[i][j] = vdup_n_f32(0.0f); + XTx[i][j] = vdup_n_f32(0.0f); + } + } + + // Read a 6x6 tile in the Winograd domain + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = vld1_f32(x_ptrs[i][j]); + x_ptrs[i][j] += 2; + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f); + + // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f); + + // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f); + + // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f); + + // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f); + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f); + + // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f); + + // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f); + + // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f); + + // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f); + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, U[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Load x + for (int i = pad_top; i < cells_i; i++) + { + for (int j = pad_left; j < cells_j; j++) + { + x[i][j] = *(x_ptrs[i][j]++); + } + } + + // Compute XT . x + for (int j = pad_left; j < cells_j; j++) + { + XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j]; + XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j]; + XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j]; + XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j]; + XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j]; + XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j]; + } + + // Compute U = XT . x . X + for (int i = 0; i < 6; i++) + { + U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4]; + U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4]; + U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4]; + U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4]; + U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4]; + U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5]; + } + + // Store the transformed matrix + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = U[i][j]; + } + } + outptr++; + } +} + +/* In the below, unusual or especially small tiles are routed via the slow + * path whereas common or large tiles are routed through a faster path. + */ +template <> +template <> +const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] = +{ + { + { + { + Transform::template process_tile<0, 0, 0, 0>, // No padding + Transform::template process_tile<0, 0, 0, 1>, // Right + Transform::template process_tile<0, 0, 0, 2>, // " " + Transform::template process_tile<0, 0, 0, 3>, // " " + Transform::template process_tile<0, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 1, 0>, // Bottom + Transform::template process_tile<0, 0, 1, 1>, // Bottom right + Transform::template process_tile<0, 0, 1, 2>, // " " + Transform::template process_tile<0, 0, 1, 3>, // " " + Transform::template process_tile<0, 0, 1, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 2, 0>, // Bottom + Transform::template process_tile<0, 0, 2, 1>, // Bottom right + Transform::template process_tile<0, 0, 2, 2>, // " " + Transform::template process_tile<0, 0, 2, 3>, // " " + Transform::template process_tile<0, 0, 2, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 3, 0>, // Bottom + Transform::template process_tile<0, 0, 3, 1>, // Bottom right + Transform::template process_tile<0, 0, 3, 2>, // " " + Transform::template process_tile<0, 0, 3, 3>, // " " + Transform::template process_tile<0, 0, 3, 4>, // " " + }, + { + Transform::template process_tile<0, 0, 4, 0>, // Bottom + Transform::template process_tile<0, 0, 4, 1>, // Bottom right + Transform::template process_tile<0, 0, 4, 2>, // " " + Transform::template process_tile<0, 0, 4, 3>, // " " + Transform::template process_tile<0, 0, 4, 4>, // " " + } + }, + { + { + Transform::template process_tile<0, 1, 0, 0>, // Left + Transform::template process_tile<0, 1, 0, 1>, + Transform::template process_tile<0, 1, 0, 2>, + Transform::template process_tile<0, 1, 0, 3>, + Transform::template process_tile<0, 1, 0, 4>, + }, + { + Transform::template process_tile<0, 1, 1, 0>, // Bottom left + Transform::template process_tile<0, 1, 1, 1>, + Transform::template process_tile<0, 1, 1, 2>, + Transform::template process_tile<0, 1, 1, 3>, + Transform::template process_tile<0, 1, 1, 4>, + }, + { + Transform::template process_tile<0, 1, 2, 0>, // " " + Transform::template process_tile<0, 1, 2, 1>, + Transform::template process_tile<0, 1, 2, 2>, + Transform::template process_tile<0, 1, 2, 3>, + Transform::template process_tile<0, 1, 2, 4>, + }, + { + Transform::template process_tile<0, 1, 3, 0>, // " " + Transform::template process_tile<0, 1, 3, 1>, + Transform::template process_tile<0, 1, 3, 2>, + Transform::template process_tile<0, 1, 3, 3>, + Transform::template process_tile<0, 1, 3, 4>, + }, + { + Transform::template process_tile<0, 1, 4, 0>, // " " + Transform::template process_tile<0, 1, 4, 1>, + Transform::template process_tile<0, 1, 4, 2>, + Transform::template process_tile<0, 1, 4, 3>, + Transform::template process_tile<0, 1, 4, 4>, + } + } + }, + { + { + { + Transform::template process_tile<1, 0, 0, 0>, // Top + Transform::template process_tile<1, 0, 0, 1>, // Top right + Transform::template process_tile<1, 0, 0, 2>, // " " + Transform::template process_tile<1, 0, 0, 3>, // " " + Transform::template process_tile<1, 0, 0, 4>, // " " + }, + { + Transform::template process_tile<1, 0, 1, 0>, + Transform::template process_tile<1, 0, 1, 1>, + Transform::template process_tile<1, 0, 1, 2>, + Transform::template process_tile<1, 0, 1, 3>, + Transform::template process_tile<1, 0, 1, 4>, + }, + { + Transform::template process_tile<1, 0, 2, 0>, + Transform::template process_tile<1, 0, 2, 1>, + Transform::template process_tile<1, 0, 2, 2>, + Transform::template process_tile<1, 0, 2, 3>, + Transform::template process_tile<1, 0, 2, 4>, + }, + { + Transform::template process_tile<1, 0, 3, 0>, + Transform::template process_tile<1, 0, 3, 1>, + Transform::template process_tile<1, 0, 3, 2>, + Transform::template process_tile<1, 0, 3, 3>, + Transform::template process_tile<1, 0, 3, 4>, + }, + { + Transform::template process_tile<1, 0, 4, 0>, + Transform::template process_tile<1, 0, 4, 1>, + Transform::template process_tile<1, 0, 4, 2>, + Transform::template process_tile<1, 0, 4, 3>, + Transform::template process_tile<1, 0, 4, 4>, + }, + }, + { + { + Transform::template process_tile<1, 1, 0, 0>, // Top left + Transform::template process_tile<1, 1, 0, 1>, + Transform::template process_tile<1, 1, 0, 2>, + Transform::template process_tile<1, 1, 0, 3>, + Transform::template process_tile<1, 1, 0, 4>, + }, + { + Transform::template process_tile<1, 1, 1, 0>, + Transform::template process_tile<1, 1, 1, 1>, + Transform::template process_tile<1, 1, 1, 2>, + Transform::template process_tile<1, 1, 1, 3>, + Transform::template process_tile<1, 1, 1, 4>, + }, + { + Transform::template process_tile<1, 1, 2, 0>, + Transform::template process_tile<1, 1, 2, 1>, + Transform::template process_tile<1, 1, 2, 2>, + Transform::template process_tile<1, 1, 2, 3>, + Transform::template process_tile<1, 1, 2, 4>, + }, + { + Transform::template process_tile<1, 1, 3, 0>, + Transform::template process_tile<1, 1, 3, 1>, + Transform::template process_tile<1, 1, 3, 2>, + Transform::template process_tile<1, 1, 3, 3>, + Transform::template process_tile<1, 1, 3, 4>, + }, + { + Transform::template process_tile<1, 1, 4, 0>, + Transform::template process_tile<1, 1, 4, 1>, + Transform::template process_tile<1, 1, 4, 2>, + Transform::template process_tile<1, 1, 4, 3>, + Transform::template process_tile<1, 1, 4, 4>, + } + } + } +}; + +template struct WinogradGEMM<4, 4, 3, 3>::InputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp new file mode 100644 index 0000000000..a95ce0e7d2 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. + const int tile_M = iceildiv(shape.n_rows, 2); + const int tile_N = iceildiv(shape.n_cols, 2); + return 24 * tile_M * tile_N * shape.n_channels; +} + +/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use + * enough tiles to cover the output space each output tile may contain 0 or 1 + * padded values to the right and bottom columns or rows of the tile, e.g.: + * + * ___ ___ + * | | | X| + * |___| |__X| + * + * ___ ___ + * | | | X| + * |X_X| |X_X| + * + * + * We provide a specialised output transform for each of these instances. + * Consequently we below construct an array of the various padding options, the + * array contains pointers to the specific implementations. + */ +template <> +template <> +template +void Transform::process_tile( + const int n_channels, + const float* const matrix_base, + const int matrix_stride, + const float* const biases, + float* const output, + const int output_row_stride, + const int output_col_stride +) +{ + constexpr int cells_i = 2 - pad_bottom; + constexpr int cells_j = 2 - pad_right; + + // Construct a map to the output cells + float *outptrs[cells_i][cells_j]; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; + } + } + const float *inptr = matrix_base; + const float *bptr = biases; + + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed during this transform + float32x4_t F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } + } + inptr += 4; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]); + + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } + + // Load the bias vector + b = vld1q_f32(bptr); + bptr += 4; + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } + } + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]); + + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } + + // Load the bias vector + b = vld1_f32(bptr); + bptr += 2; + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed during this transform + float F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + } + + // Load the bias + b = *(bptr++); + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } + } + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +{ + { + Transform::template process_tile<0, 0>, // No padding + Transform::template process_tile<0, 1>, // Right padding + }, + { + Transform::template process_tile<1, 0>, // Bottom padding + Transform::template process_tile<1, 1>, // Bottom and right padding + } +}; + +template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp new file mode 100644 index 0000000000..262f71118c --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &shape) +{ + return 0; // TODO +} + +/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use + * enough tiles to cover the output space each output tile may contain 0 or 1 + * padded values to the right and bottom columns or rows of the tile, e.g.: + * + * ___ ___ + * | | | X| + * |___| |__X| + * + * ___ ___ + * | | | X| + * |X_X| |X_X| + * + * + * We provide a specialised output transform for each of these instances. + * Consequently we below construct an array of the various padding options, the + * array contains pointers to the specific implementations. + */ +template <> +template <> +template +void Transform::process_tile( + const int n_channels, + const float* const matrix_base, + const int matrix_stride, + const float* const biases, + float* const output, + const int output_row_stride, + const int output_col_stride +) +{ + constexpr int cells_i = 2 - pad_bottom; + constexpr int cells_j = 2 - pad_right; + + // Construct a map to the output cells + float *outptrs[cells_i][cells_j]; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; + } + } + const float *inptr = matrix_base; + const float *bptr = biases; + + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } + } + inptr += 4; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1q_f32(bptr); + bptr += 4; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } + } + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1_f32(bptr); + bptr += 2; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed during this transform + float F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + b = *(bptr++); + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } + } + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +{ + { + Transform::template process_tile<0, 0>, // No padding + Transform::template process_tile<0, 1>, // Right padding + }, + { + Transform::template process_tile<1, 0>, // Bottom padding + Transform::template process_tile<1, 1>, // Bottom and right padding + } +}; + +template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp new file mode 100644 index 0000000000..8f47736f0c --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" + +namespace winograd +{ + +using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform; + +template <> +template <> +int Transform::ops_performed(const Tensor4DShape &shape) +{ + // NOTE: Cost in FLOPs rather than instructions or uops. + const int tile_M = iceildiv(shape.n_rows, 4); + const int tile_N = iceildiv(shape.n_cols, 4); + return 170 * tile_M * tile_N * shape.n_channels; +} + +// Instantiate cost methods +template int Transform::ops_performed(const Tensor4DShape&); + +/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use + * enough tiles to cover the output space each output tile may contain up to 3 + * padded values to the right and bottom columns or rows of the tile, e.g.: +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* |_______| |______X| |____X_X| |__X_X_X| +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* | | | X| | X X| | X X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* ________ ________ ________ ________ +* | | | X| | X X| | X X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X X X X| |X X X X| |X X X X| |X X X X| +* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| +* +* +* We provide a specialised output transform for each of these instances. +*/ +template <> +template <> +template +void Transform::process_tile( + const int n_channels, + const float* const matrix_base, + const int matrix_stride, + const float* const biases, + float* const output, + const int output_row_stride, + const int output_col_stride +) +{ + constexpr int cells_i = 4 - pad_bottom; + constexpr int cells_j = 4 - pad_right; + + // Construct a map to the output cells + float *outptrs[cells_i][cells_j]; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + outptrs[i][j] = output + i*output_row_stride + j*output_col_stride; + } + } + const float *inptr = matrix_base; + const float *bptr = biases; + + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } + } + inptr += 4; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f); + + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f); + + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f); + + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f); + + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1q_f32(bptr); + bptr += 4; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } + } + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f); + + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f); + + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f); + + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f); + + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } + + // Write out the output tile + b = vld1_f32(bptr); + bptr += 2; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed during this transform + float F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + b = *(bptr++); + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } + } + } +} + +template <> +template <> +const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +{ + { + Transform::template process_tile<0, 0>, + Transform::template process_tile<0, 1>, + Transform::template process_tile<0, 2>, + Transform::template process_tile<0, 3>, + }, + { + Transform::template process_tile<1, 0>, + Transform::template process_tile<1, 1>, + Transform::template process_tile<1, 2>, + Transform::template process_tile<1, 3>, + }, + { + Transform::template process_tile<2, 0>, + Transform::template process_tile<2, 1>, + Transform::template process_tile<2, 2>, + Transform::template process_tile<2, 3>, + }, + { + Transform::template process_tile<3, 0>, + Transform::template process_tile<3, 1>, + Transform::template process_tile<3, 2>, + Transform::template process_tile<3, 3>, + } +}; + +template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp new file mode 100644 index 0000000000..6c71461f81 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp" + +namespace winograd +{ + template <> + template <> + void WinogradGEMM<2, 2, 3, 3>::WeightsTransform::execute( + const int n_output_channels, + const int n_input_channels, + const float* const input, + float* const output, + const int matrix_stride, + const int matrix_row_stride + ) + { + constexpr int inner_tile_i = 4; + constexpr int inner_tile_j = 4; + + // Get pointers to each cell of the weight tensor + const auto weight_col_stride = n_input_channels * n_output_channels; + const auto weight_row_stride = 3 * weight_col_stride; + const float *inptrs[3][3]; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; + } + } + + // For each input channel + for (int ic = 0; ic < n_input_channels; ic++) + { + float *outptr = output + ic * matrix_row_stride; + + // For each output channel + int channels_remaining = n_output_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed in this kernel + float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1q_f32(inptrs[i][j]); + inptrs[i][j] += 4; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = w[0][j]; + + // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); + Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); + Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + Ww[3][j] = w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < inner_tile_i; i++) + { + V[i][0] = Ww[i][0]; + + // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); + V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); + V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + V[i][3] = Ww[i][2]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed in this kernel + float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1_f32(inptrs[i][j]); + inptrs[i][j] += 2; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = w[0][j]; + + // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); + Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); + Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f); + + Ww[3][j] = w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < inner_tile_i; i++) + { + V[i][0] = Ww[i][0]; + + // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); + V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); + V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f); + + V[i][3] = Ww[i][2]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed in this kernel + float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = *(inptrs[i][j]++); + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = w[0][j]; + Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]); + Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]); + Ww[3][j] = w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < inner_tile_i; i++) + { + V[i][0] = Ww[i][0]; + V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]); + V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]); + V[i][3] = Ww[i][2]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < inner_tile_i; i++) + { + for (int j = 0; j < inner_tile_j; j++, m++) + { + *(outptr + m*matrix_stride) = V[i][j]; + } + } + outptr++; + } + } + } + + template <> + template <> + int WinogradGEMM<2, 2, 3, 3>::WeightsTransform::ops_performed(const KernelShape &shape) + { + const int channel_prod = shape.n_input_channels * shape.n_output_channels; + return 2 * 18 * channel_prod; + } + + template struct WinogradGEMM<2, 2, 3, 3>::WeightsTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp new file mode 100644 index 0000000000..2f4f6e1ba2 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp" + +namespace winograd +{ + template <> + template <> + void WinogradGEMM<2, 2, 5, 5>::WeightsTransform::execute( + const int n_output_channels, + const int n_input_channels, + const float* const input, + float* const output, + const int matrix_stride, + const int matrix_row_stride + ) + { + // Get pointers to each cell of the weight tensor + const auto weight_col_stride = n_input_channels * n_output_channels; + const auto weight_row_stride = 5 * weight_col_stride; + const float *inptrs[5][5]; + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; + } + } + + // For each input channel + for (int ic = 0; ic < n_input_channels; ic++) + { + float *outptr = output + ic * matrix_row_stride; + + // For each output channel + int channels_remaining = n_output_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed in this kernel + float32x4_t w[5][5], Ww[6][5], V[6][6]; + + // Read weights + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + w[i][j] = vld1q_f32(inptrs[i][j]); + inptrs[i][j] += 4; + } + } + + // Compute the matrix W w + for (int j = 0; j < 5; j++) + { + // Ww[0][j] = w[0][j]/4.0f; + Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f); + + // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; + Ww[1][j] = vmulq_n_f32( + vaddq_f32( + vaddq_f32( + vaddq_f32(w[1][j], w[0][j]), + vaddq_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + -1.0f/6.0f + ); + + // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; + // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f; + Ww[2][j] = vmulq_n_f32( + vsubq_f32( + vaddq_f32( + vsubq_f32(w[1][j], w[0][j]), + vsubq_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + 1.0f/6.0f + ); + + // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; + Ww[3][j] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)), + vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; + Ww[4][j] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)), + vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[5][j] = w[4][j]; + Ww[5][j] = w[4][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + // V[i][0] = Ww[i][0]/4.0f; + V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f); + + // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; + V[i][1] = vmulq_n_f32( + vaddq_f32( + vaddq_f32( + vaddq_f32(Ww[i][1], Ww[i][0]), + vaddq_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + -1.0f/6.0f + ); + + // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; + // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f; + V[i][2] = vmulq_n_f32( + vsubq_f32( + vaddq_f32( + vsubq_f32(Ww[i][1], Ww[i][0]), + vsubq_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + 1.0f/6.0f + ); + + // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][3] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)), + vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][4] = vmulq_n_f32( + vmlaq_n_f32( + vaddq_f32( + vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)), + vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][5] = Ww[i][4]; + V[i][5] = Ww[i][4]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed in this kernel + float32x2_t w[5][5], Ww[6][5], V[6][6]; + + // Read weights + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + w[i][j] = vld1_f32(inptrs[i][j]); + inptrs[i][j] += 2; + } + } + + // Compute the matrix W w + for (int j = 0; j < 5; j++) + { + // Ww[0][j] = w[0][j]/4.0f; + Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f); + + // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; + Ww[1][j] = vmul_n_f32( + vadd_f32( + vadd_f32( + vadd_f32(w[1][j], w[0][j]), + vadd_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + -1.0f/6.0f + ); + + // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; + // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f; + Ww[2][j] = vmul_n_f32( + vsub_f32( + vadd_f32( + vsub_f32(w[1][j], w[0][j]), + vsub_f32(w[3][j], w[2][j]) + ), + w[4][j] + ), + 1.0f/6.0f + ); + + // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; + Ww[3][j] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)), + vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; + Ww[4][j] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)), + vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j]) + ), + w[4][j], 2.0f + ), + 1.0f/3.0f + ); + + // Ww[5][j] = w[4][j]; + Ww[5][j] = w[4][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + // V[i][0] = Ww[i][0]/4.0f; + V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f); + + // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; + V[i][1] = vmul_n_f32( + vadd_f32( + vadd_f32( + vadd_f32(Ww[i][1], Ww[i][0]), + vadd_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + -1.0f/6.0f + ); + + // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; + // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f; + V[i][2] = vmul_n_f32( + vsub_f32( + vadd_f32( + vsub_f32(Ww[i][1], Ww[i][0]), + vsub_f32(Ww[i][3], Ww[i][2]) + ), + Ww[i][4] + ), + 1.0f/6.0f + ); + + // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][3] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)), + vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][4] = vmul_n_f32( + vmla_n_f32( + vadd_f32( + vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)), + vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3]) + ), + Ww[i][4], 2.0f + ), + 1.0f/3.0f + ); + + // V[i][5] = Ww[i][4]; + V[i][5] = Ww[i][4]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed in this kernel + float w[5][5], Ww[6][5], V[6][6]; + + // Read weights + for (int i = 0; i < 5; i++) + { + for (int j = 0; j < 5; j++) + { + w[i][j] = *(inptrs[i][j]++); + } + } + + // Compute the matrix W w + for (int j = 0; j < 5; j++) + { + Ww[0][j] = w[0][j]/4.0f; + Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f; + Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f; + Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f; + Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f; + Ww[5][j] = w[4][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + V[i][0] = Ww[i][0]/4.0f; + V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f; + V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f; + V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f; + V[i][5] = Ww[i][4]; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = V[i][j]; + } + } + outptr++; + } + } + } + + template <> + template <> + int WinogradGEMM<2, 2, 5, 5>::WeightsTransform::ops_performed(const KernelShape &shape) + { + return 0; // TODO + } + + template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform; +} // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp new file mode 100644 index 0000000000..a56a475fc9 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp" + +namespace winograd +{ + /* Float implementation for kernel transform F(4x4, 3x3) */ + template <> + template <> + void WinogradGEMM<4, 4, 3, 3>::WeightsTransform::execute( + const int n_output_channels, + const int n_input_channels, + const float* const input, // NOTE: Data in HWIO order + float* const output, + const int matrix_stride, + const int matrix_row_stride + ) + { + // Get pointers to each cell of the weight tensor + const auto weight_col_stride = n_input_channels * n_output_channels; + const auto weight_row_stride = 3 * weight_col_stride; + const float *inptrs[3][3]; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride; + } + } + + // For each input channel + for (int ic = 0; ic < n_input_channels; ic++) + { + float *outptr = output + ic * matrix_row_stride; + + // For each output channel + int channels_remaining = n_output_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) + { + // Matrices used and computed in this kernel + float32x4_t w[3][3], Ww[6][3], V[6][6]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1q_f32(inptrs[i][j]); + inptrs[i][j] += 4; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + // Ww[0][j] = 6*w[0][j]; + Ww[0][j] = vmulq_n_f32(w[0][j], 6.0); + + // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; + Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0); + + // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; + Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0); + + // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; + Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; + Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[5][j] = 24*w[2][j]; + Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f); + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + const float recip576 = 1.0f / 576.0f; + + // V[i][0] = 6*Ww[i][0]; + V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576); + + // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]; + V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576); + + // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]; + V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576); + + // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]; + V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]; + V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][5] = 24*Ww[i][2]; + V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576); + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1q_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 4; + } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed in this kernel + float32x2_t w[3][3], Ww[6][3], V[6][6]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = vld1_f32(inptrs[i][j]); + inptrs[i][j] += 2; + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + // Ww[0][j] = 6*w[0][j]; + Ww[0][j] = vmul_n_f32(w[0][j], 6.0); + + // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; + Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0); + + // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; + Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0); + + // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; + Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; + Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f); + + // Ww[5][j] = 24*w[2][j]; + Ww[5][j] = vmul_n_f32(w[2][j], 24.0f); + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + const float recip576 = 1.0f / 576.0f; + + // V[i][0] = 6*Ww[i][0]; + V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576); + + // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]; + V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576); + + // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]; + V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576); + + // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]; + V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]; + V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576); + + // V[i][5] = 24*Ww[i][2]; + V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576); + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + vst1_f32(outptr + m*matrix_stride, V[i][j]); + } + } + outptr += 2; + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) + { + // Matrices used and computed in this kernel + float w[3][3], Ww[6][3], V[6][6]; + + // Read weights + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + w[i][j] = *(inptrs[i][j]++); + } + } + + // Compute the matrix W w + for (int j = 0; j < 3; j++) + { + Ww[0][j] = 6*w[0][j]; + Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j]; + Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j]; + Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j]; + Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j]; + Ww[5][j] = 24*w[2][j]; + } + + // Compute V = W w WT + for (int i = 0; i < 6; i++) + { + V[i][0] = ( 6*Ww[i][0]) / 576.0; + V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0; + V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0; + V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0; + V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0; + V[i][5] = (24*Ww[i][2]) / 576.0; + } + + // Store the transformed weights + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + *(outptr + m*matrix_stride) = V[i][j]; + } + } + outptr++; + } + } + } + + template <> + template <> + int WinogradGEMM<4, 4, 3, 3>::WeightsTransform::ops_performed(const KernelShape &shape) + { + const int channel_prod = shape.n_input_channels * shape.n_output_channels; + return 9 * 16 * channel_prod; + } + + template struct WinogradGEMM<4, 4, 3, 3>::WeightsTransform; +} -- cgit v1.2.1