diff options
author | Pablo Tello <pablo.tello@arm.com> | 2018-01-23 09:36:04 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:45:00 +0000 |
commit | d6ca478a7e410f8f529c2e505305b46d9fe21a9b (patch) | |
tree | 5c50c06e07f812890f127b1c4933996987f74f17 /src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp | |
parent | d05dce46a14a7b67f322328ecd95bf96bdd30bae (diff) | |
download | ComputeLibrary-d6ca478a7e410f8f529c2e505305b46d9fe21a9b.tar.gz |
COMPMID-784: Added support for biases in WinogradLayer.
1) Updated to the latest code from the RSH repo.
2) Moved Winograd transforms into kernels.
3) Added support for biases.
Change-Id: I7f39f34a599b49d7d9b549cc10a4f4d4a8007ab8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117474
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp')
-rw-r--r-- | src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp | 25 |
1 file changed, 19 insertions, 6 deletions
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
index e7907d18c0..58db7d2ecd 100644
--- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -65,6 +65,7 @@ void Transform::process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
+  const float* const biases,
   float* const output,
   const int output_row_stride,
   const int output_col_stride
@@ -83,6 +84,7 @@ void Transform::process_tile(
     }
   }
   const float *inptr = matrix_base;
+  const float *bptr = biases;
 
   // For each channel of the output
   int channels_remaining = n_channels;
@@ -90,7 +92,7 @@ void Transform::process_tile(
   for (; channels_remaining >= 4; channels_remaining -= 4)
   {
     // Matrices used and computed during this transform
-    float32x4_t F[4][4], FZ[4][2], f[2][2];
+    float32x4_t F[4][4], FZ[4][2], f[2][2], b;
 
     // Read a 4x4 tile in the Winograd domain
     for (int i = 0, m = 0; i < 4; i++)
@@ -122,12 +124,16 @@ void Transform::process_tile(
       f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
     }
 
+    // Load the bias vector
+    b = vld1q_f32(bptr);
+    bptr += 4;
+
     // Write out the output tile
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        vst1q_f32(outptrs[i][j], f[i][j]);
+        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
         outptrs[i][j] += 4;
       }
     }
@@ -137,7 +143,7 @@ void Transform::process_tile(
   for (; channels_remaining >= 2; channels_remaining -= 2)
   {
     // Matrices used and computed during this transform
-    float32x2_t F[4][4], FZ[4][2], f[2][2];
+    float32x2_t F[4][4], FZ[4][2], f[2][2], b;
 
     // Read a 4x4 tile in the Winograd domain
     for (int i = 0, m = 0; i < 4; i++)
@@ -169,12 +175,16 @@ void Transform::process_tile(
       f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
     }
 
+    // Load the bias vector
+    b = vld1_f32(bptr);
+    bptr += 2;
+
     // Write out the output tile
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        vst1_f32(outptrs[i][j], f[i][j]);
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
         outptrs[i][j] += 2;
       }
     }
@@ -183,7 +193,7 @@ void Transform::process_tile(
   for (; channels_remaining; channels_remaining--)
   {
     // Matrices used and computed during this transform
-    float F[4][4], FZ[4][2], f[2][2];
+    float F[4][4], FZ[4][2], f[2][2], b;
 
     // Read a 4x4 tile in the Winograd domain
     for (int i = 0, m = 0; i < 4; i++)
@@ -209,12 +219,15 @@ void Transform::process_tile(
       f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
     }
 
+    // Load the bias
+    b = *(bptr++);
+
     // Write out the output tile
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        *(outptrs[i][j]++) = f[i][j];
+        *(outptrs[i][j]++) = f[i][j] + b;
       }
     }
   }