From 8951933e5dd7be8d922affea3cc23a48a05b694d Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Fri, 17 Nov 2017 11:52:36 +0000
Subject: COMPMID-687: Winograd layer.

Change-Id: Ica682d08e851491bf4a26b8d17908c014844055e
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110990
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com
---
 .../transforms/output_2x2_3x3/a64_float.hpp        | 650 ++++++++++++++++++++
 .../output_2x2_3x3/a64_float_two_stage.hpp         | 655 +++++++++++++++++++++
 2 files changed, 1305 insertions(+)
 create mode 100644 arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
 create mode 100644 arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp

diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
new file mode 100644
index 0000000000..5925f9d569
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* Float implementation for AArch64.
+ */ +#ifdef __aarch64__ +namespace winograd { + + +template <> +template <> +inline void Winograd2x2_3x3GemmOutput::_execute( + const Tensor4DShape &output_shape, + float *output, + const float *input, + const int mstride, + const int matrix_row_stride +) { + const int tile_M = output_shape.n_rows / 2; + const int tile_N = output_shape.n_cols / 2; + int batch = output_shape.n_batches; + float *outptr = output; + + const float *inptr0 = input; + const float *inptr4 = input + 4 * mstride; + const float *inptr8 = input + 8 * mstride; + const float *inptr12 = input + 12 * mstride; + + const size_t col_stride = sizeof(float) * output_shape.n_channels; + const size_t row_stride = col_stride * tile_N * 2; + + asm volatile ( + // Aliases for elements of the input matrix `F` + // V-register Q-register + "F11 .req v0\n" "qF11 .req q0\n" + "F12 .req v1\n" "qF12 .req q1\n" + "F13 .req v2\n" "qF13 .req q2\n" + "F14 .req v3\n" "qF14 .req q3\n" + "F21 .req v4\n" "qF21 .req q4\n" + "F22 .req v5\n" "qF22 .req q5\n" + "F23 .req v6\n" "qF23 .req q6\n" + "F24 .req v7\n" "qF24 .req q7\n" + "F31 .req v8\n" "qF31 .req q8\n" + "F32 .req v9\n" "qF32 .req q9\n" + "F33 .req v10\n" "qF33 .req q10\n" + "F34 .req v11\n" "qF34 .req q11\n" + "F41 .req v12\n" "qF41 .req q12\n" + "F42 .req v13\n" "qF42 .req q13\n" + "F43 .req v14\n" "qF43 .req q14\n" + "F44 .req v15\n" "qF44 .req q15\n" + + // Aliases for elements of the intermediate matrix `FZ` + "FZ11 .req v16\n" + "FZ12 .req v17\n" + "FZ21 .req v18\n" + "FZ22 .req v19\n" + "FZ31 .req v20\n" + "FZ32 .req v21\n" + "FZ41 .req v22\n" + "FZ42 .req v23\n" + + // Aliases for elements of the output matrix `f` (called `g` due to case + // insensitivity of aliases). + " g11 .req v24\n" + "qg11 .req q24\n" + " g12 .req v25\n" + "qg12 .req q25\n" + " g21 .req v26\n" + "qg21 .req q26\n" + " g22 .req v27\n" + "qg22 .req q27\n" + + // Prepare the various strides + "col_stride .req %x[col_stride]\n" + "row_stride .req %x[row_stride]\n" + "row_plus_col_stride .req %x[row_plus_col_stride]\n" + + "mstride1 .req %x[mstride1]\n" + "mstride2 .req %x[mstride2]\n" + "mstride3 .req %x[mstride3]\n" + + "tile_i .req x19\n" // Tile row counter + "tile_j .req x20\n" // Tile column counter + "channel .req x21\n" // Channel counter + + "1:" // Loop over batches + "mov tile_i, %x[tile_M]\n" // Reset tile row counter + + "2:" // Loop over rows of tiles + "mov tile_j, %x[tile_N]\n" // Reset tile column counter + + "3:" // Loop over columns of tiles + // Perform initial loads of the matrix `F` + "ldr qF11, [%x[inptr0]]\n" + "ldr qF12, [%x[inptr0], mstride1]\n" + "ldr qF13, [%x[inptr0], mstride2]\n" + "ldr qF14, [%x[inptr0], mstride3]\n" + "add %x[inptr0], %x[inptr0], #0x10\n" + "ldr qF21, [%x[inptr4]]\n" + "ldr qF22, [%x[inptr4], mstride1]\n" + "subs channel, %x[n_channels], #4\n" // Reset channel counter + + "ldr qF23, [%x[inptr4], mstride2]\n" + "ldr qF24, [%x[inptr4], mstride3]\n" + "add %x[inptr4], %x[inptr4], #0x10\n" + "beq 5f\n" // Jump straight to tail if necessary + + "4:" // Loop over channels + "ldr qF31, [%x[inptr8]]\n" + "fadd FZ11.4s, F11.4s, F12.4s\n" + + "ldr qF32, [%x[inptr8], mstride1]\n" + "fsub FZ12.4s, F12.4s, F13.4s\n" + + "ldr qF33, [%x[inptr8], mstride2]\n" + "fadd FZ11.4s, FZ11.4s, F13.4s\n" + + "ldr qF34, [%x[inptr8], mstride3]\n" + "fsub FZ12.4s, FZ12.4s, F14.4s\n" + + "ldr qF41, [%x[inptr12]]\n" + "fadd FZ21.4s, F21.4s, F22.4s\n" + + "ldr qF42, [%x[inptr12], mstride1]\n" + "fsub FZ22.4s, F22.4s, F23.4s\n" + + "ldr qF43, [%x[inptr12], mstride2]\n" + "fadd FZ21.4s, FZ21.4s, 
F23.4s\n" + + "ldr qF44, [%x[inptr12], mstride3]\n" + "fsub FZ22.4s, FZ22.4s, F24.4s\n" + + "fadd FZ31.4s, F31.4s, F32.4s\n" + "add %x[inptr8], %x[inptr8], #0x10\n" + + "fsub FZ32.4s, F32.4s, F33.4s\n" + "add %x[inptr12], %x[inptr12], #0x10\n" + + "fadd FZ31.4s, FZ31.4s, F33.4s\n" + + "fsub FZ32.4s, FZ32.4s, F34.4s\n" + + "fadd g11.4s, FZ11.4s, FZ21.4s\n" + + "fadd g12.4s, FZ12.4s, FZ22.4s\n" + + "fadd g11.4s, g11.4s, FZ31.4s\n" + + "fadd g12.4s, g12.4s, FZ32.4s\n" + + "ldr qF11, [%x[inptr0]]\n" + "fadd FZ41.4s, F41.4s, F42.4s\n" + + "ldr qF12, [%x[inptr0], mstride1]\n" + "fsub g21.4s, FZ21.4s, FZ31.4s\n" + + "ldr qF13, [%x[inptr0], mstride2]\n" + "fsub FZ42.4s, F42.4s, F43.4s\n" + + "ldr qF14, [%x[inptr0], mstride3]\n" + "str qg11, [%x[outptr]]\n" + + "ldr qF21, [%x[inptr4]]\n" + "fadd FZ41.4s, FZ41.4s, F43.4s\n" + + "ldr qF22, [%x[inptr4], mstride1]\n" + "str qg12, [%x[outptr], col_stride]\n" + + "ldr qF23, [%x[inptr4], mstride2]\n" + "fsub FZ42.4s, FZ42.4s, F44.4s\n" + + "ldr qF24, [%x[inptr4], mstride3]\n" + "fsub g22.4s, FZ22.4s, FZ32.4s\n" + + "fsub g21.4s, g21.4s, FZ41.4s\n" + "add %x[inptr0], %x[inptr0], #0x10\n" + + "fsub g22.4s, g22.4s, FZ42.4s\n" + "add %x[inptr4], %x[inptr4], #0x10\n" + + "subs channel, channel, #4\n" + + "str qg21, [%x[outptr], row_stride]\n" + + "str qg22, [%x[outptr], row_plus_col_stride]\n" + + "add %x[outptr], %x[outptr], #0x10\n" + + "bne 4b\n" + + "5:" // Channel tail + "ldr qF31, [%x[inptr8]]\n" + "fadd FZ11.4s, F11.4s, F12.4s\n" + + "ldr qF32, [%x[inptr8], mstride1]\n" + "fsub FZ12.4s, F12.4s, F13.4s\n" + + "ldr qF33, [%x[inptr8], mstride2]\n" + "fadd FZ11.4s, FZ11.4s, F13.4s\n" + + "ldr qF34, [%x[inptr8], mstride3]\n" + "fsub FZ12.4s, FZ12.4s, F14.4s\n" + + "ldr qF41, [%x[inptr12]]\n" + "fadd FZ21.4s, F21.4s, F22.4s\n" + + "ldr qF42, [%x[inptr12], mstride1]\n" + "fsub FZ22.4s, F22.4s, F23.4s\n" + + "ldr qF43, [%x[inptr12], mstride2]\n" + "fadd FZ21.4s, FZ21.4s, F23.4s\n" + + "ldr qF44, [%x[inptr12], mstride3]\n" + "fsub FZ22.4s, FZ22.4s, F24.4s\n" + + "fadd FZ31.4s, F31.4s, F32.4s\n" + "add %x[inptr8], %x[inptr8], #0x10\n" + + "fsub FZ32.4s, F32.4s, F33.4s\n" + "add %x[inptr12], %x[inptr12], #0x10\n" + + "fadd FZ31.4s, FZ31.4s, F33.4s\n" + + "fsub FZ32.4s, FZ32.4s, F34.4s\n" + + "fadd g11.4s, FZ11.4s, FZ21.4s\n" + + "fadd g12.4s, FZ12.4s, FZ22.4s\n" + + "fadd g11.4s, g11.4s, FZ31.4s\n" + + "fadd g12.4s, g12.4s, FZ32.4s\n" + + "fadd FZ41.4s, F41.4s, F42.4s\n" + + "fsub g21.4s, FZ21.4s, FZ31.4s\n" + + "fsub FZ42.4s, F42.4s, F43.4s\n" + + "str qg11, [%x[outptr]]\n" + + "fadd FZ41.4s, FZ41.4s, F43.4s\n" + + "str qg12, [%x[outptr], col_stride]\n" + + "fsub FZ42.4s, FZ42.4s, F44.4s\n" + + "fsub g22.4s, FZ22.4s, FZ32.4s\n" + + "fsub g21.4s, g21.4s, FZ41.4s\n" + + "fsub g22.4s, g22.4s, FZ42.4s\n" + + "subs channel, channel, #4\n" + + "str qg21, [%x[outptr], row_stride]\n" + + // Progress input pointers to the next row of the matrix + "add %x[inptr0], %x[inptr0], %x[mrowpad]\n" + "add %x[inptr4], %x[inptr4], %x[mrowpad]\n" + "add %x[inptr8], %x[inptr8], %x[mrowpad]\n" + "add %x[inptr12], %x[inptr12], %x[mrowpad]\n" + + "str qg22, [%x[outptr], row_plus_col_stride]\n" + + "add %x[outptr], %x[outptr], #0x10\n" + + + "add %x[outptr], %x[outptr], col_stride\n" + "subs tile_j, tile_j, #1\n" + "bne 3b\n" + + "add %x[outptr], %x[outptr], row_stride\n" + "subs tile_i, tile_i, #1\n" + "bne 2b\n" + + "subs %[batch], %[batch], #1\n" + "bne 1b\n" + + ".unreq F11\n" ".unreq qF11\n" + ".unreq F12\n" ".unreq qF12\n" + ".unreq F13\n" ".unreq qF13\n" + ".unreq F14\n" ".unreq 
qF14\n" + ".unreq F21\n" ".unreq qF21\n" + ".unreq F22\n" ".unreq qF22\n" + ".unreq F23\n" ".unreq qF23\n" + ".unreq F24\n" ".unreq qF24\n" + ".unreq F31\n" ".unreq qF31\n" + ".unreq F32\n" ".unreq qF32\n" + ".unreq F33\n" ".unreq qF33\n" + ".unreq F34\n" ".unreq qF34\n" + ".unreq F41\n" ".unreq qF41\n" + ".unreq F42\n" ".unreq qF42\n" + ".unreq F43\n" ".unreq qF43\n" + ".unreq F44\n" ".unreq qF44\n" + + ".unreq FZ11\n" ".unreq FZ12\n" + ".unreq FZ21\n" ".unreq FZ22\n" + ".unreq FZ31\n" ".unreq FZ32\n" + ".unreq FZ41\n" ".unreq FZ42\n" + + ".unreq g11\n" ".unreq qg11\n" + ".unreq g12\n" ".unreq qg12\n" + ".unreq g21\n" ".unreq qg21\n" + ".unreq g22\n" ".unreq qg22\n" + + ".unreq col_stride\n" + ".unreq row_stride\n" + ".unreq row_plus_col_stride\n" + + ".unreq mstride1\n" + ".unreq mstride2\n" + ".unreq mstride3\n" + + ".unreq tile_i \n" + ".unreq tile_j \n" + ".unreq channel\n" + + : [batch] "+r" (batch), + [outptr] "+r" (outptr), + [inptr0] "+r" (inptr0), + [inptr4] "+r" (inptr4), + [inptr8] "+r" (inptr8), + [inptr12] "+r" (inptr12) + : [tile_M] "r" (tile_M), + [tile_N] "r" (tile_N), + [n_channels] "r" (output_shape.n_channels), + [col_stride] "r" (col_stride), + [row_stride] "r" (row_stride), + [row_plus_col_stride] "r" (row_stride + col_stride), + [mstride1] "r" (mstride * sizeof(float)), + [mstride2] "r" (2 * mstride * sizeof(float)), + [mstride3] "r" (3 * mstride * sizeof(float)), + [mrowpad] "r" ((matrix_row_stride - output_shape.n_channels) * sizeof(float)) + : "x19", "x20", "x21", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "q16", "q17", "q18", "q19", "q20", "q21", + "q22", "q23", "q24", "q25", "q26", "q27", + "cc", "memory" + ); +} + +template <> +template +inline void Winograd2x2_3x3GemmOutput::_execute( + const Tensor4DShape &output_shape, + float *output, + const float *input, + const int mstride, + const int matrix_row_stride +) { + // Compute basic information about the shape of the matrices + const int tile_M = output_shape.n_rows / 2; + const int tile_N = output_shape.n_cols / 2; + const int n_channels = output_shape.n_channels; + + // Extract 16 input pointers + const float* inptr[16]; + for (int i = 0; i < 16; i++) { + inptr[i] = input + i*mstride; + } + + // Extract 4 output pointers + float *outptr00 = output; + float *outptr01 = outptr00 + n_channels; + float *outptr10 = outptr00 + output_shape.n_cols * n_channels; + float *outptr11 = outptr10 + n_channels; + + // Progress over the output tiles, generating output values. + for (int batch = 0; batch < output_shape.n_batches; batch++) { + for (int tile_i = 0; tile_i < tile_M; tile_i++) { + for (int tile_j = 0; tile_j < tile_N; tile_j++) { + for (int channel = 0; channel < n_channels; channel++) { + // Read values from the input pointers + float F[4][4]; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + F[i][j] = *(inptr[i*4 + j]++); + } + } + + // Compute the matrix F.Z + float ZF[4][2]; + ZF[0][0] = F[0][0] + F[0][1] + F[0][2]; + ZF[0][1] = F[0][1] - F[0][2] - F[0][3]; + ZF[1][0] = F[1][0] + F[1][1] + F[1][2]; + ZF[1][1] = F[1][1] - F[1][2] - F[1][3]; + ZF[2][0] = F[2][0] + F[2][1] + F[2][2]; + ZF[2][1] = F[2][1] - F[2][2] - F[2][3]; + ZF[3][0] = F[3][0] + F[3][1] + F[3][2]; + ZF[3][1] = F[3][1] - F[3][2] - F[3][3]; + + // Hence compute the output matrix Z^T . 
(F.Z) + *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0]; + *(outptr01++) = ZF[0][1] + ZF[1][1] + ZF[2][1]; + *(outptr10++) = ZF[1][0] - ZF[2][0] - ZF[3][0]; + *(outptr11++) = ZF[1][1] - ZF[2][1] - ZF[3][1]; + } + + // Progress the input pointers to the next row + for (int i = 0; i < 16; i++) { + inptr[i] += matrix_row_stride - n_channels; + } + + // Progress the output pointers to the next column + outptr00 += n_channels; + outptr01 += n_channels; + outptr10 += n_channels; + outptr11 += n_channels; + } + + if (tail_N) { + // Only evaluate the left-most columns of the output + for (int channel = 0; channel < n_channels; channel++) { + // Read values from the input pointers + float F[4][3]; + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 3; j++) { + F[i][j] = *(inptr[i*4 + j]++); + } + } + for (int i = 0; i < 4; i++) { + inptr[i*4 + 3]++; + } + + // Compute the matrix F.Z + float ZF[4][1]; + ZF[0][0] = F[0][0] + F[0][1] + F[0][2]; + ZF[1][0] = F[1][0] + F[1][1] + F[1][2]; + ZF[2][0] = F[2][0] + F[2][1] + F[2][2]; + ZF[3][0] = F[3][0] + F[3][1] + F[3][2]; + + // Hence compute the output matrix Z^T . (F.Z) + *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0]; + *(outptr10++) = ZF[1][0] - ZF[2][0] - ZF[3][0]; + } + + // Progress the input pointers to the next row + for (int i = 0; i < 16; i++) { + inptr[i] += matrix_row_stride - n_channels; + } + + // Progress the output pointers to the next column + outptr01 += n_channels; // Account for being skipped above + outptr11 += n_channels; // Account for being skipped above + } + + // Progress the output pointers to the next row + outptr00 += output_shape.n_cols * n_channels; + outptr01 += output_shape.n_cols * n_channels; + outptr10 += output_shape.n_cols * n_channels; + outptr11 += output_shape.n_cols * n_channels; + } + + if (tail_M) { + // Only work on the upper row of the output + for (int tile_j = 0; tile_j < tile_N; tile_j++) { + for (int channel = 0; channel < n_channels; channel++) { + // Read values from the input pointers + float F[3][4]; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + F[i][j] = *(inptr[i*4 + j]++); + } + } + for (int j = 0; j < 4; j++) { + inptr[12 + j]++; + } + + // Compute the matrix F.Z + float ZF[3][2]; + ZF[0][0] = F[0][0] + F[0][1] + F[0][2]; + ZF[0][1] = F[0][1] - F[0][2] - F[0][3]; + ZF[1][0] = F[1][0] + F[1][1] + F[1][2]; + ZF[1][1] = F[1][1] - F[1][2] - F[1][3]; + ZF[2][0] = F[2][0] + F[2][1] + F[2][2]; + ZF[2][1] = F[2][1] - F[2][2] - F[2][3]; + + // Hence compute the output matrix Z^T . 
(F.Z) + *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0]; + *(outptr01++) = ZF[0][1] + ZF[1][1] + ZF[2][1]; + } + + // Progress the input pointers to the next row + for (int i = 0; i < 16; i++) { + inptr[i] += matrix_row_stride - n_channels; + } + + // Progress the output pointers to the next column + outptr00 += n_channels; + outptr01 += n_channels; + outptr10 += 2 * n_channels; // Account for being skipped above + outptr11 += 2 * n_channels; // Account for being skipped above + } + + if (tail_N) { + // Only evaluate the upper-left cell of the output + for (int channel = 0; channel < n_channels; channel++) { + // Read values from the input pointers + float F[3][3]; + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + F[i][j] = *(inptr[i*4 + j]); + } + } + for (int i = 0; i < 16; i++) { + inptr[i]++; + } + + // Compute the matrix F.Z + float ZF[3][1]; + ZF[0][0] = F[0][0] + F[0][1] + F[0][2]; + ZF[1][0] = F[1][0] + F[1][1] + F[1][2]; + ZF[2][0] = F[2][0] + F[2][1] + F[2][2]; + + // Hence compute the output matrix Z^T . (F.Z) + *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0]; + } + + // Progress the input pointers to the next row + for (int i = 0; i < 16; i++) { + inptr[i] += matrix_row_stride - n_channels; + } + + // Progress the output pointers to the next column + outptr01 += n_channels; // Account for being skipped above + outptr10 += n_channels; // Account for being skipped above + outptr11 += n_channels; // Account for being skipped above + } + } + } +} + +/*****************************************************************************/ +template <> +inline void Winograd2x2_3x3GemmOutput::execute( + const Tensor4DShape &output_shape, + float* const matrix_base, + const int matrix_stride, + const int matrix_row_stride, + float* const output +) { + // Dispatch to an appropriate implementation based on the shape of the output + // tensor. 
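+  // Odd output sizes leave a partial tile on the bottom row (tail_M) and/or
+  // the right-hand column (tail_N), and the channel remainder (n_channels % 4)
+  // selects how the vectorised channel loop finishes; for example, a 7x7
+  // output with n_channels % 4 == 2 dispatches to _execute<true, true, 2>.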
+  if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
+    constexpr bool tail_M = true, tail_N = true;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  } else if (output_shape.n_rows % 2) {
+    constexpr bool tail_M = true, tail_N = false;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  } else if (output_shape.n_cols % 2) {
+    constexpr bool tail_M = false, tail_N = true;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  } else {
+    constexpr bool tail_M = false, tail_N = false;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  }
+}
+/*****************************************************************************/
+
+} // namespace winograd
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp
new file mode 100644
index 0000000000..f551b12b52
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +/*****************************************************************************/ +// Compute ZF specializations + +template <> +template <> +inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage::compute_zf<0>( + const int n_rows, const int n_channels, + float* output, const float* const input[16] +) { + // Make copies of some variables + int row = n_rows; + float* outptr = output; + const float* inptr = input[0]; + + // Perform the transformation + asm volatile ( + // "inptr0 .req %x[inptr]\n" + "inptr1 .req x0\n" + "inptr2 .req x1\n" + "inptr3 .req x2\n" + "inptr4 .req x3\n" + "inptr5 .req x4\n" + "inptr6 .req x5\n" + "inptr7 .req x6\n" + "inptr8 .req x7\n" + "inptr9 .req x8\n" + "inptr10 .req x9\n" + "inptr11 .req x10\n" + "inptr12 .req x11\n" + "inptr13 .req x12\n" + "inptr14 .req x13\n" + "inptr15 .req x14\n" + + // "outptr0 .req %x[outptr]\n" + "outptr1 .req x15\n" + "outptr2 .req x16\n" + "outptr3 .req x17\n" + "outptr4 .req x18\n" + "outptr5 .req x19\n" + "outptr6 .req x20\n" + "outptr7 .req x21\n" + + // Compute additional pointers into the input and output matrices. + "mstride .req x22\n" // Matrix stride + "mul mstride, %x[row], %x[n_channels]\n" + "lsl mstride, mstride, #2\n" // * sizeof(float) + + "add inptr1, %x[inptr], mstride\n" + "add inptr2, %x[inptr], mstride, LSL #1\n" + "add inptr3, inptr2, mstride\n" + "add inptr4, inptr3, mstride\n" + "add inptr5, inptr4, mstride\n" + "add inptr6, inptr5, mstride\n" + "add inptr7, inptr6, mstride\n" + "add inptr8, inptr7, mstride\n" + "add inptr9, inptr8, mstride\n" + "add inptr10, inptr9, mstride\n" + "add inptr11, inptr10, mstride\n" + "add inptr12, inptr11, mstride\n" + "add inptr13, inptr12, mstride\n" + "add inptr14, inptr13, mstride\n" + "add inptr15, inptr14, mstride\n" + + "add outptr1, %[outptr], mstride\n" + "add outptr2, outptr1, mstride\n" + "add outptr3, outptr2, mstride\n" + "add outptr4, outptr3, mstride\n" + "add outptr5, outptr4, mstride\n" + "add outptr6, outptr5, mstride\n" + "add outptr7, outptr6, mstride\n" + + ".unreq mstride\n" + + "column .req x22\n" // Column loop counter + + "1:" // Loop over rows + "ldr q0, [%x[inptr]], #0x10\n" + "ldr q1, [inptr1], #0x10\n" + "ldr q2, [inptr2], #0x10\n" + "ldr q3, [inptr3], #0x10\n" + "ldr q4, [inptr4], #0x10\n" + "ldr q5, [inptr5], #0x10\n" + "ldr q6, [inptr6], #0x10\n" + "ldr q7, [inptr7], #0x10\n" + "subs column, %x[n_channels], #0x4\n" + "beq 3f\n" + + "2:" // Loop over columns + "ldr q8, [inptr8], #0x10\n" + "prfm pldl1keep, [%x[inptr], #196]\n" + "fadd v16.4s, v0.4s, v1.4s\n" + + "ldr q9, [inptr9], #0x10\n" + "prfm pldl1keep, [inptr1, #196]\n" + "fsub v17.4s, v1.4s, v2.4s\n" + + "ldr q10, [inptr10], #0x10\n" + "prfm pldl1keep, [inptr2, #196]\n" + "fadd v16.4s, v16.4s, v2.4s\n" + + "ldr q11, [inptr11], #0x10\n" + "prfm pldl1keep, [inptr3, #196]\n" + "fsub v17.4s, v17.4s, v3.4s\n" + + "ldr q12, [inptr12], #0x10\n" + "prfm pldl1keep, [inptr4, #196]\n" + "str q16, [%x[outptr]], #0x10\n" + + "ldr q13, [inptr13], #0x10\n" + "prfm pldl1keep, 
[inptr5, #196]\n" + "str q17, [outptr1], #0x10\n" + + "ldr q14, [inptr14], #0x10\n" + "prfm pldl1keep, [inptr6, #196]\n" + "fadd v16.4s, v4.4s, v5.4s\n" + + "ldr q15, [inptr15], #0x10\n" + "prfm pldl1keep, [inptr7, #196]\n" + "fsub v17.4s, v5.4s, v6.4s\n" + + "ldr q0, [%x[inptr]], #0x10\n" + "prfm pldl1keep, [inptr8, #196]\n" + "fadd v16.4s, v16.4s, v6.4s\n" + + "ldr q1, [inptr1], #0x10\n" + "prfm pldl1keep, [inptr9, #196]\n" + "fsub v17.4s, v17.4s, v7.4s\n" + + "ldr q2, [inptr2], #0x10\n" + "prfm pldl1keep, [inptr10, #196]\n" + "str q16, [outptr2], #0x10\n" + + "ldr q3, [inptr3], #0x10\n" + "prfm pldl1keep, [inptr11, #196]\n" + "str q17, [outptr3], #0x10\n" + + "ldr q4, [inptr4], #0x10\n" + "prfm pldl1keep, [inptr12, #196]\n" + "fadd v16.4s, v8.4s, v9.4s\n" + + "ldr q5, [inptr5], #0x10\n" + "prfm pldl1keep, [inptr13, #196]\n" + "fsub v17.4s, v9.4s, v10.4s\n" + + "ldr q6, [inptr6], #0x10\n" + "prfm pldl1keep, [inptr14, #196]\n" + "fadd v16.4s, v16.4s, v10.4s\n" + + "ldr q7, [inptr7], #0x10\n" + "prfm pldl1keep, [inptr15, #196]\n" + "fsub v17.4s, v17.4s, v11.4s\n" + + "str q16, [outptr4], #0x10\n" + "fadd v16.4s, v12.4s, v13.4s\n" + "fsub v18.4s, v13.4s, v14.4s\n" + + "str q17, [outptr5], #0x10\n" + "fadd v16.4s, v16.4s, v14.4s\n" + "fsub v18.4s, v18.4s, v15.4s\n" + + "str q16, [outptr6], #0x10\n" + "subs column, column, #0x4\n" + + "str q18, [outptr7], #0x10\n" + "bne 2b\n" + + "3:" // Tail + "ldr q8, [inptr8], #0x10\n" + "prfm pldl1keep, [%x[inptr], #196]\n" + "fadd v16.4s, v0.4s, v1.4s\n" + + "ldr q9, [inptr9], #0x10\n" + "prfm pldl1keep, [inptr1, #196]\n" + "fsub v17.4s, v1.4s, v2.4s\n" + + "ldr q10, [inptr10], #0x10\n" + "prfm pldl1keep, [inptr2, #196]\n" + "fadd v16.4s, v16.4s, v2.4s\n" + + "ldr q11, [inptr11], #0x10\n" + "prfm pldl1keep, [inptr3, #196]\n" + "fsub v17.4s, v17.4s, v3.4s\n" + + "ldr q12, [inptr12], #0x10\n" + "prfm pldl1keep, [inptr4, #196]\n" + "str q16, [%x[outptr]], #0x10\n" + + "ldr q13, [inptr13], #0x10\n" + "prfm pldl1keep, [inptr5, #196]\n" + "str q17, [outptr1], #0x10\n" + + "ldr q14, [inptr14], #0x10\n" + "prfm pldl1keep, [inptr6, #196]\n" + "fadd v16.4s, v4.4s, v5.4s\n" + + "ldr q15, [inptr15], #0x10\n" + "prfm pldl1keep, [inptr7, #196]\n" + "fsub v17.4s, v5.4s, v6.4s\n" + + "prfm pldl1keep, [inptr8, #196]\n" + "prfm pldl1keep, [inptr9, #196]\n" + "fadd v16.4s, v16.4s, v6.4s\n" + + "prfm pldl1keep, [inptr10, #196]\n" + "prfm pldl1keep, [inptr11, #196]\n" + "fsub v17.4s, v17.4s, v7.4s\n" + + "prfm pldl1keep, [inptr12, #196]\n" + "prfm pldl1keep, [inptr13, #196]\n" + "str q16, [outptr2], #0x10\n" + + "prfm pldl1keep, [inptr14, #196]\n" + "prfm pldl1keep, [inptr15, #196]\n" + "str q17, [outptr3], #0x10\n" + + "fadd v16.4s, v8.4s, v9.4s\n" + "fsub v17.4s, v9.4s, v10.4s\n" + + "fadd v16.4s, v16.4s, v10.4s\n" + "fsub v17.4s, v17.4s, v11.4s\n" + + "str q16, [outptr4], #0x10\n" + "fadd v16.4s, v12.4s, v13.4s\n" + "fsub v18.4s, v13.4s, v14.4s\n" + + "str q17, [outptr5], #0x10\n" + "fadd v16.4s, v16.4s, v14.4s\n" + "fsub v18.4s, v18.4s, v15.4s\n" + + "str q16, [outptr6], #0x10\n" + "str q18, [outptr7], #0x10\n" + + "subs %x[row], %x[row], #0x1\n" + "bne 1b\n" + + ".unreq inptr1\n" + ".unreq inptr2\n" + ".unreq inptr3\n" + ".unreq inptr4\n" + ".unreq inptr5\n" + ".unreq inptr6\n" + ".unreq inptr7\n" + ".unreq inptr8\n" + ".unreq inptr9\n" + ".unreq inptr10\n" + ".unreq inptr11\n" + ".unreq inptr12\n" + ".unreq inptr13\n" + ".unreq inptr14\n" + ".unreq inptr15\n" + ".unreq outptr1\n" + ".unreq outptr2\n" + ".unreq outptr3\n" + ".unreq outptr4\n" + ".unreq outptr5\n" + 
".unreq outptr6\n" + ".unreq outptr7\n" + + : [row] "+r" (row), + [inptr] "+r" (inptr), + [outptr] "+r" (outptr) + : [n_channels] "r" (n_channels), + [sizeof_float] "i" (sizeof(float)) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "q16", "q17", "x0", "x1", "x2", "x3", "x4", + "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "cc", "memory" + ); +} + +/*****************************************************************************/ +// Compute ZFZ^T specializations + +template <> +template <> +inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage::compute_zfzT( + const Tensor4DShape &output_shape, + float* const output, const float* const input +) { + const int tile_M = output_shape.n_rows / 2; + const int tile_N = output_shape.n_cols / 2; + int batch = output_shape.n_batches; + float *outptr = output; + const float *inptr = input; + + asm volatile ( + // Compute input pointers + "inptr1 .req x0\n" + "inptr2 .req x1\n" + "inptr3 .req x2\n" + "inptr4 .req x3\n" + "inptr5 .req x4\n" + "inptr6 .req x5\n" + "inptr7 .req x6\n" + "inptr8 .req x7\n" + + "mstride .req x8\n" + "mul mstride, %x[tile_M], %x[tile_N]\n" + "mul mstride, mstride, %x[n_channels]\n" + "lsl mstride, mstride, #2\n" // * sizeof(float) + + "add inptr1, %[inptr], mstride\n" + "add inptr2, inptr1, mstride\n" + "add inptr3, inptr2, mstride\n" + "add inptr4, inptr3, mstride\n" + "add inptr5, inptr4, mstride\n" + "add inptr6, inptr5, mstride\n" + "add inptr7, inptr6, mstride\n" + "add inptr8, inptr7, mstride\n" + + ".unreq mstride\n" + + // Compute initial output pointers + "outptr01 .req x8\n" + "outptr10 .req x9\n" + "outptr11 .req x10\n" + + "add outptr01, %x[outptr], %x[n_channels], LSL #2\n" + "add outptr10, %x[outptr], %x[row_stride], LSL #2\n" + "add outptr11, outptr10, %x[n_channels], LSL #2\n" + + "tile_i .req x11\n" + "tile_j .req x12\n" + "channel .req x13\n" + + "1:" // Loop over batches + "mov tile_i, %x[tile_M]\n" + + "2:" // Loop over rows of output tiles + "mov tile_j, %x[tile_N]\n" + + "3:" // Loop over columns of output tiles + "ldr q0, [%x[inptr]], #0x10\n" + "ldr q2, [inptr2], #0x10\n" + "subs channel, %x[n_channels], #0x4\n" + + "ldr q1, [inptr1], #0x10\n" + "ldr q3, [inptr3], #0x10\n" + "beq 6f\n" + + "4:" + "ldr q4, [inptr4], #0x10\n" + "ldr q5, [inptr5], #0x10\n" + "fadd v16.4s, v0.4s, v2.4s\n" + + "ldr q6, [inptr6], #0x10\n" + "ldr q7, [inptr7], #0x10\n" + "fadd v17.4s, v1.4s, v3.4s\n" + + "ldr q8, [%x[inptr]], #0x10\n" + "ldr q10, [inptr2], #0x10\n" + "fadd v16.4s, v16.4s, v4.4s\n" + + "ldr q9, [inptr1], #0x10\n" + "ldr q11, [inptr3], #0x10\n" + "fadd v17.4s, v17.4s, v5.4s\n" + + "str q16, [%x[outptr]], #0x10\n" + "prfm pldl1strm, [%x[inptr], #196]\n" + "fsub v18.4s, v2.4s, v4.4s\n" + + "str q17, [outptr01], #0x10\n" + "prfm pldl1strm, [inptr2, #196]\n" + "fsub v19.4s, v3.4s, v5.4s\n" + + "prfm pldl1strm, [inptr1, #196]\n" + "prfm pldl1strm, [inptr3, #196]\n" + "fsub v18.4s, v18.4s, v6.4s\n" + + "prfm pldl1strm, [inptr4, #196]\n" + "prfm pldl1strm, [inptr5, #196]\n" + "fsub v19.4s, v19.4s, v7.4s\n" + + "str q18, [outptr10], #0x10\n" + "prfm pldl1strm, [inptr6, #196]\n" + "prfm pldl1strm, [inptr7, #196]\n" + + "subs channel, channel, #0x4\n" + + "str q19, [outptr11], #0x10\n" + "beq 6f\n" // Branch to tail + + "ldr q12, [inptr4], #0x10\n" + "ldr q13, [inptr5], #0x10\n" + "fadd v16.4s, v8.4s, v10.4s\n" + + "ldr q14, [inptr6], #0x10\n" + "ldr q15, [inptr7], #0x10\n" + "fadd 
v17.4s, v9.4s, v11.4s\n" + + "ldr q0, [%x[inptr]], #0x10\n" + "ldr q2, [inptr2], #0x10\n" + "fadd v16.4s, v16.4s, v12.4s\n" + + "ldr q1, [inptr1], #0x10\n" + "ldr q3, [inptr3], #0x10\n" + "fadd v17.4s, v17.4s, v13.4s\n" + + "str q16, [%x[outptr]], #0x10\n" + "prfm pldl1strm, [%x[inptr], #196]\n" + "fsub v18.4s, v10.4s, v12.4s\n" + + "str q17, [outptr01], #0x10\n" + "prfm pldl1strm, [inptr2, #196]\n" + "fsub v19.4s, v11.4s, v13.4s\n" + + "prfm pldl1strm, [inptr1, #196]\n" + "prfm pldl1strm, [inptr3, #196]\n" + "fsub v18.4s, v18.4s, v14.4s\n" + + "prfm pldl1strm, [inptr4, #196]\n" + "prfm pldl1strm, [inptr5, #196]\n" + "fsub v19.4s, v19.4s, v15.4s\n" + + "str q18, [outptr10], #0x10\n" + "prfm pldl1strm, [inptr6, #196]\n" + "prfm pldl1strm, [inptr7, #196]\n" + + "subs channel, channel, #0x4\n" + + "str q19, [outptr11], #0x10\n" + "bne 4b\n" // Continue loop + + "5:" // Tail + "ldr q12, [inptr4], #0x10\n" + "ldr q13, [inptr5], #0x10\n" + "fadd v16.4s, v8.4s, v10.4s\n" + + "ldr q14, [inptr6], #0x10\n" + "ldr q15, [inptr7], #0x10\n" + "fadd v17.4s, v9.4s, v11.4s\n" + + "fadd v16.4s, v16.4s, v12.4s\n" + + "fadd v17.4s, v17.4s, v13.4s\n" + + "str q16, [%x[outptr]], #0x10\n" + "fsub v18.4s, v10.4s, v12.4s\n" + "fsub v19.4s, v11.4s, v13.4s\n" + + "str q17, [outptr01], #0x10\n" + "fsub v18.4s, v18.4s, v14.4s\n" + "fsub v19.4s, v19.4s, v15.4s\n" + + "str q18, [outptr10], #0x10\n" + "str q19, [outptr11], #0x10\n" + "b 7f\n" + + "6:" // Tail + "ldr q4, [inptr4], #0x10\n" + "ldr q5, [inptr5], #0x10\n" + "fadd v16.4s, v0.4s, v2.4s\n" + + "ldr q6, [inptr6], #0x10\n" + "ldr q7, [inptr7], #0x10\n" + "fadd v17.4s, v1.4s, v3.4s\n" + + "fadd v16.4s, v16.4s, v4.4s\n" + + "fadd v17.4s, v17.4s, v5.4s\n" + + "str q16, [%x[outptr]], #0x10\n" + "fsub v18.4s, v2.4s, v4.4s\n" + "fsub v19.4s, v3.4s, v5.4s\n" + + "str q17, [outptr01], #0x10\n" + "fsub v18.4s, v18.4s, v6.4s\n" + "fsub v19.4s, v19.4s, v7.4s\n" + + "str q18, [outptr10], #0x10\n" + "str q19, [outptr11], #0x10\n" + + "7:" + "add %x[outptr], %x[outptr], %x[n_channels], LSL #2\n" + "add outptr01, outptr01, %x[n_channels], LSL #2\n" + "add outptr10, outptr10, %x[n_channels], LSL #2\n" + "add outptr11, outptr11, %x[n_channels], LSL #2\n" + + "subs tile_j, tile_j, #1\n" + "bne 3b\n" + + // Progress the output pointers to the new row + "add %x[outptr], %x[outptr], %x[row_stride], LSL #2\n" + "add outptr01, outptr01, %x[row_stride], LSL #2\n" + "add outptr10, outptr10, %x[row_stride], LSL #2\n" + "add outptr11, outptr11, %x[row_stride], LSL #2\n" + + "subs tile_i, tile_i, #1\n" + "bne 2b\n" + + "subs %[batch], %[batch], #1\n" + "bne 1b\n" + "5:" + + ".unreq inptr1\n" + ".unreq inptr2\n" + ".unreq inptr3\n" + ".unreq inptr4\n" + ".unreq inptr5\n" + ".unreq inptr6\n" + ".unreq inptr7\n" + ".unreq inptr8\n" + ".unreq outptr01\n" + ".unreq outptr10\n" + ".unreq outptr11\n" + : [batch] "+r" (batch), + [outptr] "+r" (outptr), + [inptr] "+r" (inptr) + : [tile_M] "r" (tile_M), + [tile_N] "r" (tile_N), + [n_channels] "r" (output_shape.n_channels), + [row_stride] "r" (output_shape.n_cols * output_shape.n_channels) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "x13", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "cc", "memory" + ); +} +/*****************************************************************************/ + +/*****************************************************************************/ +template <> +inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage::execute( + const Tensor4DShape &output_shape, + float* 
const matrices[16], float* const output
+) {
+  // profiler prof;
+
+  // Allocate memory for the intermediate matrices
+  const int tile_M = iceildiv(output_shape.n_rows, 2);
+  const int tile_N = iceildiv(output_shape.n_cols, 2);
+  const int n_rows = output_shape.n_batches * tile_M * tile_N;
+  const int n_channels = output_shape.n_channels;
+  float* matrices_zf = reinterpret_cast<float*>(
+    calloc(8 * n_rows * n_channels, sizeof(float))
+  );
+
+  // Perform the first stage transform, computing ZF.
+  const auto f_compute_zf = [&] () {
+    switch (n_channels % 4) {
+      case 0:
+        compute_zf<0>(n_rows, n_channels, matrices_zf, matrices);
+        break;
+      case 1:
+        compute_zf<1>(n_rows, n_channels, matrices_zf, matrices);
+        break;
+      case 2:
+        compute_zf<2>(n_rows, n_channels, matrices_zf, matrices);
+        break;
+      case 3:
+        compute_zf<3>(n_rows, n_channels, matrices_zf, matrices);
+    }
+  };
+  // prof("Compute ZF", f_compute_zf, 16 * n_rows * n_channels * sizeof(float), 0, 8 * n_rows * n_channels * sizeof(float));
+  f_compute_zf();
+
+  // Perform the second stage transform, finishing Z F Z^T - variable dispatch
+  // based on size of the output and the channel tail.
+  const auto f_compute_zfzT = [&] () {
+    if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
+      constexpr bool tail_M = true, tail_N = true;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    } else if (output_shape.n_rows % 2) {
+      constexpr bool tail_M = true, tail_N = false;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    } else if (output_shape.n_cols % 2) {
+      constexpr bool tail_M = false, tail_N = true;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    } else {
+      constexpr bool tail_M = false, tail_N = false;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    }
+  };
+  // prof("Compute ZFZT", f_compute_zfzT, 8 * n_rows * n_channels * sizeof(float), 0, 4 * n_rows * n_channels * sizeof(float));
+  f_compute_zfzT();
+
+  free(reinterpret_cast<void*>(matrices_zf));
+}
+/*****************************************************************************/
+
+#endif // __aarch64__
-- 
cgit v1.2.1
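
For reference, the per-tile arithmetic both of these files implement is out = Z^T . F . Z, where F is the 4x4 tile of GEMM results and Z = [1 0; 1 1; 1 -1; 0 -1]. A minimal standalone scalar sketch of that transform follows (illustrative names, not part of the patch):

#include <cstdio>

// Scalar reference for the F(2x2, 3x3) output transform: computes
// out = Z^T . F . Z for one tile, mirroring the ZF/g formulas above.
static void winograd_2x2_3x3_output_tile(const float F[4][4], float out[2][2])
{
  // First stage: FZ = F . Z (4x4 by 4x2).
  float FZ[4][2];
  for (int i = 0; i < 4; i++) {
    FZ[i][0] = F[i][0] + F[i][1] + F[i][2];  // Z column [1 1 1 0]
    FZ[i][1] = F[i][1] - F[i][2] - F[i][3];  // Z column [0 1 -1 -1]
  }
  // Second stage: out = Z^T . FZ (2x4 by 4x2).
  for (int j = 0; j < 2; j++) {
    out[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
    out[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
  }
}

int main()
{
  const float F[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8},
                         {9, 10, 11, 12}, {13, 14, 15, 16}};
  float out[2][2];
  winograd_2x2_3x3_output_tile(F, out);
  std::printf("%g %g\n%g %g\n", out[0][0], out[0][1], out[1][0], out[1][1]);
  return 0;
}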