From d6ca478a7e410f8f529c2e505305b46d9fe21a9b Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Tue, 23 Jan 2018 09:36:04 +0000 Subject: COMPMID-784: Added support for biases in WinogradLayer. 1) Updated to the latest code from the RSH repo. 2) Moved winograd transforms into kernels. 3) Added support for biases Change-Id: I7f39f34a599b49d7d9b549cc10a4f4d4a8007ab8 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117474 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../winograd/transforms/output_2x2_3x3_fp32.cpp | 25 ++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp') diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp index e7907d18c0..58db7d2ecd 100644 --- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp +++ b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp @@ -65,6 +65,7 @@ void Transform::process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, + const float* const biases, float* const output, const int output_row_stride, const int output_col_stride @@ -83,6 +84,7 @@ void Transform::process_tile( } } const float *inptr = matrix_base; + const float *bptr = biases; // For each channel of the output int channels_remaining = n_channels; @@ -90,7 +92,7 @@ void Transform::process_tile( for (; channels_remaining >= 4; channels_remaining -= 4) { // Matrices used and computed during this transform - float32x4_t F[4][4], FZ[4][2], f[2][2]; + float32x4_t F[4][4], FZ[4][2], f[2][2], b; // Read a 4x4 tile in the Winograd domain for (int i = 0, m = 0; i < 4; i++) @@ -122,12 +124,16 @@ void Transform::process_tile( f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); } + // Load the bias vector + b = vld1q_f32(bptr); + bptr += 4; + // Write out the output tile for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) { - vst1q_f32(outptrs[i][j], f[i][j]); + vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b)); outptrs[i][j] += 4; } } @@ -137,7 +143,7 @@ void Transform::process_tile( for (; channels_remaining >= 2; channels_remaining -= 2) { // Matrices used and computed during this transform - float32x2_t F[4][4], FZ[4][2], f[2][2]; + float32x2_t F[4][4], FZ[4][2], f[2][2], b; // Read a 4x4 tile in the Winograd domain for (int i = 0, m = 0; i < 4; i++) @@ -169,12 +175,16 @@ void Transform::process_tile( f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]); } + // Load the bias vector + b = vld1_f32(bptr); + bptr += 2; + // Write out the output tile for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) { - vst1_f32(outptrs[i][j], f[i][j]); + vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b)); outptrs[i][j] += 2; } } @@ -183,7 +193,7 @@ void Transform::process_tile( for (; channels_remaining; channels_remaining--) { // Matrices used and computed during this transform - float F[4][4], FZ[4][2], f[2][2]; + float F[4][4], FZ[4][2], f[2][2], b; // Read a 4x4 tile in the Winograd domain for (int i = 0, m = 0; i < 4; i++) @@ -209,12 +219,15 @@ void Transform::process_tile( f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j]; } + // Load the bias + b = *(bptr++); + // Write out the output tile for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) { - *(outptrs[i][j]++) = f[i][j]; + *(outptrs[i][j]++) = f[i][j] + b; } } } -- cgit v1.2.1