From 4d9379a9d3ada794f532ce8acdc8607f4faa2b21 Mon Sep 17 00:00:00 2001 From: Andrew Mundy Date: Thu, 15 Mar 2018 16:47:03 +0000 Subject: COMPMID-1040: Added support for nullptr bias tensor in NEWinogradLayer Change-Id: Ie624ee17c63dede711d913a82819e128954a57c9 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124861 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../runtime/NEON/functions/NEConvolutionLayer.h | 5 +- src/core/NEON/kernels/NEWinogradLayerKernel.cpp | 3 +- .../winograd/transforms/output_2x2_3x3_fp32.cpp | 344 +++++++++++------ .../winograd/transforms/output_2x2_5x5_fp32.cpp | 339 ++++++++++++----- .../winograd/transforms/output_4x4_3x3_fp32.cpp | 406 +++++++++++++++------ src/runtime/NEON/functions/NEConvolutionLayer.cpp | 13 +- src/runtime/NEON/functions/NEWinogradLayer.cpp | 10 +- tests/validation/NEON/ConvolutionLayer.cpp | 43 ++- tests/validation/NEON/DilatedConvolutionLayer.cpp | 30 +- tests/validation/fixtures/WinogradLayerFixture.h | 13 +- 10 files changed, 831 insertions(+), 375 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index c67951a7ee..220d1cb249 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -92,8 +92,6 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QS8/QASYMM8/QS16/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. @@ -104,9 +102,8 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, + static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo()); - // Inherited methods overridden: void run() override; diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp index b2e44f8e09..fcd1594601 100644 --- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp +++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp @@ -299,12 +299,11 @@ void NEWinogradLayerTransformOutputKernelbuffer()); ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace); ARM_COMPUTE_ERROR_ON_NULLPTR(_output); OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, - reinterpret_cast(_biases->buffer()), _output, + (_biases ? reinterpret_cast(_biases->buffer()) : nullptr), _output, _n_batches, _n_rows, _n_cols, _n_channels); // The code below cannot be moved to configure because biases hasn't been allocated at that point diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp index a95ce0e7d2..3b3cda0aa9 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp @@ -86,148 +86,288 @@ void Transform::process_tile( const float *inptr = matrix_base; const float *bptr = biases; - // For each channel of the output - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) + if (bptr) { - // Matrices used and computed during this transform - float32x4_t F[4][4], FZ[4][2], f[2][2], b; - - // Read a 4x4 tile in the Winograd domain - for (int i = 0, m = 0; i < 4; i++) + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) { - for (int j = 0; j < 4; j++, m++) + // Matrices used and computed during this transform + float32x4_t F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) { - F[i][j] = vld1q_f32(inptr + m*matrix_stride); + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } } - } - inptr += 4; + inptr += 4; - // Compute the matrix F Z - for (int i = 0; i < 4; i++) - { - // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; - FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]); + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]); - // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; - FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]); - } + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]); + } - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; - f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); - // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; - f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); - } + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } - // Load the bias vector - b = vld1q_f32(bptr); - bptr += 4; + // Load the bias vector + b = vld1q_f32(bptr); + bptr += 4; - // Write out the output tile - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) + // Write out the output tile + for (int i = 0; i < cells_i; i++) { - vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); - outptrs[i][j] += 4; + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } } } - } #endif // __aarch64__ #ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed during this transform - float32x2_t F[4][4], FZ[4][2], f[2][2], b; - - // Read a 4x4 tile in the Winograd domain - for (int i = 0, m = 0; i < 4; i++) + for (; channels_remaining >= 2; channels_remaining -= 2) { - for (int j = 0; j < 4; j++, m++) + // Matrices used and computed during this transform + float32x2_t F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) { - F[i][j] = vld1_f32(inptr + m*matrix_stride); + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } } - } - inptr += 2; + inptr += 2; - // Compute the matrix F Z - for (int i = 0; i < 4; i++) - { - // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; - FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]); + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]); - // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; - FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]); - } + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]); + } - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; - f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); - // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; - f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); - } + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } - // Load the bias vector - b = vld1_f32(bptr); - bptr += 2; + // Load the bias vector + b = vld1_f32(bptr); + bptr += 2; - // Write out the output tile - for (int i = 0; i < cells_i; i++) + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) { - for (int j = 0; j < cells_j; j++) + // Matrices used and computed during this transform + float F[4][4], FZ[4][2], f[2][2], b; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) { - vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); - outptrs[i][j] += 2; + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + } + + // Load the bias + b = *(bptr++); + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } } } } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) + else { - // Matrices used and computed during this transform - float F[4][4], FZ[4][2], f[2][2], b; - - // Read a 4x4 tile in the Winograd domain - for (int i = 0, m = 0; i < 4; i++) + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) { - for (int j = 0; j < 4; j++, m++) + // Matrices used and computed during this transform + float32x4_t F[4][4], FZ[4][2], f[2][2]; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) { - F[i][j] = *(inptr + m*matrix_stride); + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } } - } - inptr++; + inptr += 4; - // Compute the matrix F Z - for (int i = 0; i < 4; i++) - { - FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; - FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; - } + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]); - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; - f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], f[i][j]); + outptrs[i][j] += 4; + } + } } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[4][4], FZ[4][2], f[2][2]; - // Load the bias - b = *(bptr++); + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; - // Write out the output tile - for (int i = 0; i < cells_i; i++) + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + // FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]); + + // FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]); + + // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], f[i][j]); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) { - for (int j = 0; j < cells_j; j++) + // Matrices used and computed during this transform + float F[4][4], FZ[4][2], f[2][2]; + + // Read a 4x4 tile in the Winograd domain + for (int i = 0, m = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 4; i++) + { + FZ[i][0] = F[i][0] + F[i][1] + F[i][2]; + FZ[i][1] = F[i][1] - F[i][2] - F[i][3]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j]; + f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) { - *(outptrs[i][j]++) = f[i][j] + b; + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j]; + } } } } diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp index 262f71118c..cafce9549d 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp @@ -35,6 +35,7 @@ template <> template <> int Transform::ops_performed(const Tensor4DShape &shape) { + (void) shape; return 0; // TODO } @@ -83,142 +84,282 @@ void Transform::process_tile( const float *inptr = matrix_base; const float *bptr = biases; - // For each channel of the output - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) + if (bptr) { - // Matrices used and computed during this transform - float32x4_t F[6][6], FZ[6][2], f[2][2], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) { - for (int j = 0; j < 6; j++, m++) + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - F[i][j] = vld1q_f32(inptr + m*matrix_stride); + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } } - } - inptr += 4; + inptr += 4; - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; - FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); - } + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; - f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); - } + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } - // Write out the output tile - b = vld1q_f32(bptr); - bptr += 4; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) + // Write out the output tile + b = vld1q_f32(bptr); + bptr += 4; + for (int i = 0; i < cells_i; i++) { - vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); - outptrs[i][j] += 4; + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } } } - } #endif // __aarch64__ #ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed during this transform - float32x2_t F[6][6], FZ[6][2], f[2][2], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) + for (; channels_remaining >= 2; channels_remaining -= 2) { - for (int j = 0; j < 6; j++, m++) + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - F[i][j] = vld1_f32(inptr + m*matrix_stride); + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } } - } - inptr += 2; + inptr += 2; - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; - FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); - } + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; - f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); - } + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } - // Write out the output tile - b = vld1_f32(bptr); - bptr += 2; - for (int i = 0; i < cells_i; i++) + // Write out the output tile + b = vld1_f32(bptr); + bptr += 2; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) { - for (int j = 0; j < cells_j; j++) + // Matrices used and computed during this transform + float F[6][6], FZ[6][2], f[2][2], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); - outptrs[i][j] += 2; + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + b = *(bptr++); + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } } } } -#endif // __arm_any__ - for (; channels_remaining; channels_remaining--) + else { - // Matrices used and computed during this transform - float F[6][6], FZ[6][2], f[2][2], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) { - for (int j = 0; j < 6; j++, m++) + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][2], f[2][2]; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - F[i][j] = *(inptr + m*matrix_stride); + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } } - } - inptr++; + inptr += 4; - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; - } + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); - // Compute the output tile f = ZT F Z - for (int j = 0; j < 2; j++) - { - f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], f[i][j]); + outptrs[i][j] += 4; + } + } } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][2], f[2][2]; - // Write out the output tile - b = *(bptr++); - for (int i = 0; i < cells_i; i++) + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]); + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], f[i][j]); + outptrs[i][j] += 2; + } + } + } +#endif // __arm_any__ + for (; channels_remaining; channels_remaining--) { - for (int j = 0; j < cells_j; j++) + // Matrices used and computed during this transform + float F[6][6], FZ[6][2], f[2][2]; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 2; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) { - *(outptrs[i][j]++) = f[i][j] + b; + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j]; + } } } } diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp index 609823b9e1..cd3bdef0d2 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp @@ -100,170 +100,338 @@ void Transform::process_tile( const float *inptr = matrix_base; const float *bptr = biases; - // For each channel of the output - int channels_remaining = n_channels; -#ifdef __aarch64__ - for (; channels_remaining >= 4; channels_remaining -= 4) + if (bptr) { - // Matrices used and computed during this transform - float32x4_t F[6][6], FZ[6][4], f[4][4], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) { - for (int j = 0; j < 6; j++, m++) + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - F[i][j] = vld1q_f32(inptr + m*matrix_stride); + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } } - } - inptr += 4; + inptr += 4; - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; - FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f); + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f); - // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; - FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f); + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f); - // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; - FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); - } + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } - // Compute the output tile f = ZT F Z - for (int j = 0; j < 4; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; - f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f); + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f); - // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; - f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f); + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f); - // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; - f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); - } + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } - // Write out the output tile - b = vld1q_f32(bptr); - bptr += 4; - for (int i = 0; i < cells_i; i++) - { - for (int j = 0; j < cells_j; j++) + // Write out the output tile + b = vld1q_f32(bptr); + bptr += 4; + for (int i = 0; i < cells_i; i++) { - vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); - outptrs[i][j] += 4; + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); + outptrs[i][j] += 4; + } } } - } #endif // __aarch64__ #ifdef __arm_any__ - for (; channels_remaining >= 2; channels_remaining -= 2) - { - // Matrices used and computed during this transform - float32x2_t F[6][6], FZ[6][4], f[4][4], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) + for (; channels_remaining >= 2; channels_remaining -= 2) { - for (int j = 0; j < 6; j++, m++) + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - F[i][j] = vld1_f32(inptr + m*matrix_stride); + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } } - } - inptr += 2; + inptr += 2; - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); - // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; - FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f); + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f); - // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; - FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f); + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f); - // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; - FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); - } + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } - // Compute the output tile f = ZT F Z - for (int j = 0; j < 4; j++) - { - // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); - // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; - f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f); + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f); - // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; - f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f); + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f); - // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; - f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); - } + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } - // Write out the output tile - b = vld1_f32(bptr); - bptr += 2; - for (int i = 0; i < cells_i; i++) + // Write out the output tile + b = vld1_f32(bptr); + bptr += 2; + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); + outptrs[i][j] += 2; + } + } + } +#endif + for (; channels_remaining; channels_remaining--) { - for (int j = 0; j < cells_j; j++) + // Matrices used and computed during this transform + float F[6][6], FZ[6][4], f[4][4], b; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) { - vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); - outptrs[i][j] += 2; + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + b = *(bptr++); + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j] + b; + } } } } -#endif - for (; channels_remaining; channels_remaining--) + else { - // Matrices used and computed during this transform - float F[6][6], FZ[6][4], f[4][4], b; - - // Read a 6x6 tile in the Winograd domain - for (int i = 0, m = 0; i < 6; i++) + // For each channel of the output + int channels_remaining = n_channels; +#ifdef __aarch64__ + for (; channels_remaining >= 4; channels_remaining -= 4) { - for (int j = 0; j < 6; j++, m++) + // Matrices used and computed during this transform + float32x4_t F[6][6], FZ[6][4], f[4][4]; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) { - F[i][j] = *(inptr + m*matrix_stride); + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1q_f32(inptr + m*matrix_stride); + } } - } - inptr++; + inptr += 4; - // Compute the matrix F Z - for (int i = 0; i < 6; i++) - { - FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; - FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; - FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; - FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; - } + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]); - // Compute the output tile f = ZT F Z - for (int j = 0; j < 4; j++) - { - f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; - f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; - f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; - f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f); + + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f); + + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f); + + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f); + + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1q_f32(outptrs[i][j], f[i][j]); + outptrs[i][j] += 4; + } + } } +#endif // __aarch64__ +#ifdef __arm_any__ + for (; channels_remaining >= 2; channels_remaining -= 2) + { + // Matrices used and computed during this transform + float32x2_t F[6][6], FZ[6][4], f[4][4]; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = vld1_f32(inptr + m*matrix_stride); + } + } + inptr += 2; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]); + + // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f); - // Write out the output tile - b = *(bptr++); - for (int i = 0; i < cells_i; i++) + // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f); + + // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]); + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]); + + // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f); + + // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f); + + // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]); + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) + { + for (int j = 0; j < cells_j; j++) + { + vst1_f32(outptrs[i][j], f[i][j]); + outptrs[i][j] += 2; + } + } + } +#endif + for (; channels_remaining; channels_remaining--) { - for (int j = 0; j < cells_j; j++) + // Matrices used and computed during this transform + float F[6][6], FZ[6][4], f[4][4]; + + // Read a 6x6 tile in the Winograd domain + for (int i = 0, m = 0; i < 6; i++) + { + for (int j = 0; j < 6; j++, m++) + { + F[i][j] = *(inptr + m*matrix_stride); + } + } + inptr++; + + // Compute the matrix F Z + for (int i = 0; i < 6; i++) + { + FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4]; + FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4]; + FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4]; + FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5]; + } + + // Compute the output tile f = ZT F Z + for (int j = 0; j < 4; j++) + { + f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j]; + f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j]; + f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j]; + f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j]; + } + + // Write out the output tile + for (int i = 0; i < cells_i; i++) { - *(outptrs[i][j]++) = f[i][j] + b; + for (int j = 0; j < cells_j; j++) + { + *(outptrs[i][j]++) = f[i][j]; + } } } } diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index badeb07405..f248821de6 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -47,8 +47,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info)); - switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, - weights_info, dilation, act_info)) + switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info)) { case ConvolutionMethod::WINOGRAD: { @@ -80,7 +79,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info) { - switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, dilation, act_info)) + switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info)) { case ConvolutionMethod::WINOGRAD: //Validate Winograd @@ -101,15 +100,19 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, +ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info) { + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_UNUSED(weights_info); ARM_COMPUTE_UNUSED(act_info); if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1) - && (conv_info.stride().second == 1) && (biases != nullptr) && (dilation == Size2D(1U, 1U))) + && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U))) { return ConvolutionMethod::WINOGRAD; } diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp index f82845c7ad..126be46b2e 100644 --- a/src/runtime/NEON/functions/NEWinogradLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp @@ -52,7 +52,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported"); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); @@ -83,9 +83,9 @@ NEWinogradLayer::NEWinogradLayer(std::shared_ptr memory_manager) void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(conv_info); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info)); _weights = weights; _input = input; @@ -260,8 +260,8 @@ void NEWinogradLayer::run() Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, weights, biases, output, conv_info)); return Status{}; } diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 27216af6d1..3a365253cb 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -76,22 +76,17 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo TEST_SUITE(NEON) TEST_SUITE(ConvolutionLayer) -DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0), - TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), - TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0) - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0) - })), - framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(1U), 1, DataType::F32, 0), - TensorInfo(TensorShape(21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(16U), 1, DataType::F32, 0) - })), +DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), + TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0) + }), + framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0) + })), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(6U, 6U, 1U), 1, DataType::F32, 0), TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0), TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0), @@ -103,11 +98,10 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z PadStrideInfo(3, 2, 1, 0) })), framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), - input_info, weights_info, biases_info, output_info, conv_info, expected) + input_info, weights_info, output_info, conv_info, expected) { ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), - &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } @@ -117,6 +111,9 @@ TEST_SUITE(WinogradLayer) template using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture; +template +using NEWinogradConvolutionLayerNoBiasFixture = WinogradConvolutionLayerValidationFixture; + TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), @@ -128,6 +125,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, frame validate(Accessor(_target), _reference, tolerance_f32); } +FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + datasets::SmallWinogradConvolutionLayer5x5Dataset()), + framework::dataset::make("DataType", { DataType::F32 })), + ActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f32); +} + TEST_SUITE_END() TEST_SUITE_END() diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp index 1e8c19fc5e..358cec3d6f 100644 --- a/tests/validation/NEON/DilatedConvolutionLayer.cpp +++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp @@ -66,22 +66,17 @@ const auto CNNDataTypes = framework::dataset::make("DataType", TEST_SUITE(NEON) TEST_SUITE(DilatedConvolutionLayer) -DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( - framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0), - TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), - TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0) - }), - framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0) - })), - framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(1U), 1, DataType::F32, 0), - TensorInfo(TensorShape(21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(21U), 1, DataType::F32, 0), - TensorInfo(TensorShape(16U), 1, DataType::F32, 0) - })), +DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0), + TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0), + TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0) + }), + framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0), + TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0) + })), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(6U, 6U, 1U), 1, DataType::F32, 0), TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0), TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0), @@ -98,11 +93,10 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z Size2D(3U, 3U) })), framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })), - input_info, weights_info, biases_info, output_info, conv_info, dilation, expected) + input_info, weights_info, output_info, conv_info, dilation, expected) { ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), - &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), dilation); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h index 5210cbf720..481eb93e80 100644 --- a/tests/validation/fixtures/WinogradLayerFixture.h +++ b/tests/validation/fixtures/WinogradLayerFixture.h @@ -48,7 +48,7 @@ namespace validation { using namespace arm_compute::misc::shape_calculator; -template +template class WinogradConvolutionLayerValidationFixture : public framework::Fixture { public: @@ -93,7 +93,7 @@ protected: // Create and configure function FunctionType conv; - conv.configure(&src, &weights, &bias, &dst, info, act_info); + conv.configure(&src, &weights, (use_bias) ? &bias : nullptr, &dst, info, act_info); ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS); @@ -133,7 +133,14 @@ protected: // Fill reference fill(src, 0, -1.f, 1.f); fill(weights, 1, -1.f, 1.f); - fill(bias, 2, -1.f, 1.f); + if(use_bias) + { + fill(bias, 2, -1.f, 1.f); + } + else + { + fill(bias, 2, 0.f, 0.f); + } return (act_info.enabled()) ? reference::activation_layer(reference::convolution_layer(src, weights, bias, output_shape, info), act_info) : reference::convolution_layer(src, weights, bias, output_shape, info); -- cgit v1.2.1