aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvolutionLayer.h5
-rw-r--r--src/core/NEON/kernels/NEWinogradLayerKernel.cpp3
-rw-r--r--src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp344
-rw-r--r--src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp339
-rw-r--r--src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp406
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp13
-rw-r--r--src/runtime/NEON/functions/NEWinogradLayer.cpp10
-rw-r--r--tests/validation/NEON/ConvolutionLayer.cpp43
-rw-r--r--tests/validation/NEON/DilatedConvolutionLayer.cpp30
-rw-r--r--tests/validation/fixtures/WinogradLayerFixture.h13
10 files changed, 831 insertions, 375 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index c67951a7ee..220d1cb249 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -92,8 +92,6 @@ public:
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QS8/QASYMM8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -104,9 +102,8 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
// Inherited methods overridden:
void run() override;
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
index b2e44f8e09..fcd1594601 100644
--- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
@@ -299,12 +299,11 @@ void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, Ker
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer());
ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
- reinterpret_cast<T *>(_biases->buffer()), _output,
+ (_biases ? reinterpret_cast<T *>(_biases->buffer()) : nullptr), _output,
_n_batches, _n_rows, _n_cols, _n_channels);
// The code below cannot be moved to configure because biases hasn't been allocated at that point
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index a95ce0e7d2..3b3cda0aa9 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -86,148 +86,288 @@ void Transform::process_tile(
const float *inptr = matrix_base;
const float *bptr = biases;
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
+ if (bptr)
{
- // Matrices used and computed during this transform
- float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 4; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr += 4;
+ inptr += 4;
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
- }
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
- // Load the bias vector
- b = vld1q_f32(bptr);
- bptr += 4;
+ // Load the bias vector
+ b = vld1q_f32(bptr);
+ bptr += 4;
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
{
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
}
}
- }
#endif // __aarch64__
#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- for (int j = 0; j < 4; j++, m++)
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr += 2;
+ inptr += 2;
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
- // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
- }
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
- // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
- f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
- }
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
- // Load the bias vector
- b = vld1_f32(bptr);
- bptr += 2;
+ // Load the bias vector
+ b = vld1_f32(bptr);
+ bptr += 2;
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- for (int j = 0; j < cells_j; j++)
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2], b;
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Load the bias
+ b = *(bptr++);
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
}
}
}
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
+ else
{
- // Matrices used and computed during this transform
- float F[4][4], FZ[4][2], f[2][2], b;
-
- // Read a 4x4 tile in the Winograd domain
- for (int i = 0, m = 0; i < 4; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 4; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
{
- F[i][j] = *(inptr + m*matrix_stride);
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr++;
+ inptr += 4;
- // Compute the matrix F Z
- for (int i = 0; i < 4; i++)
- {
- FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
- FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
- }
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
- f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
+ }
}
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2];
- // Load the bias
- b = *(bptr++);
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
- // Write out the output tile
- for (int i = 0; i < cells_i; i++)
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- for (int j = 0; j < cells_j; j++)
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
{
- *(outptrs[i][j]++) = f[i][j] + b;
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
}
}
}
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index 262f71118c..cafce9549d 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -35,6 +35,7 @@ template <>
template <>
int Transform::ops_performed(const Tensor4DShape &shape)
{
+ (void) shape;
return 0; // TODO
}
@@ -83,142 +84,282 @@ void Transform::process_tile(
const float *inptr = matrix_base;
const float *bptr = biases;
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
+ if (bptr)
{
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr += 4;
+ inptr += 4;
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
- // Write out the output tile
- b = vld1q_f32(bptr);
- bptr += 4;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Write out the output tile
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ for (int i = 0; i < cells_i; i++)
{
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
}
}
- }
#endif // __aarch64__
#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr += 2;
+ inptr += 2;
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
- }
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
- f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
- }
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
- // Write out the output tile
- b = vld1_f32(bptr);
- bptr += 2;
- for (int i = 0; i < cells_i; i++)
+ // Write out the output tile
+ b = vld1_f32(bptr);
+ bptr += 2;
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- for (int j = 0; j < cells_j; j++)
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][2], f[2][2], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ b = *(bptr++);
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
}
}
}
-#endif // __arm_any__
- for (; channels_remaining; channels_remaining--)
+ else
{
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][2], f[2][2], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][2], f[2][2];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = *(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr++;
+ inptr += 4;
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
- }
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 2; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
+ }
}
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][2], f[2][2];
- // Write out the output tile
- b = *(bptr++);
- for (int i = 0; i < cells_i; i++)
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
{
- for (int j = 0; j < cells_j; j++)
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][2], f[2][2];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
{
- *(outptrs[i][j]++) = f[i][j] + b;
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
}
}
}
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index 609823b9e1..cd3bdef0d2 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -100,170 +100,338 @@ void Transform::process_tile(
const float *inptr = matrix_base;
const float *bptr = biases;
- // For each channel of the output
- int channels_remaining = n_channels;
-#ifdef __aarch64__
- for (; channels_remaining >= 4; channels_remaining -= 4)
+ if (bptr)
{
- // Matrices used and computed during this transform
- float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr += 4;
+ inptr += 4;
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
- // Write out the output tile
- b = vld1q_f32(bptr);
- bptr += 4;
- for (int i = 0; i < cells_i; i++)
- {
- for (int j = 0; j < cells_j; j++)
+ // Write out the output tile
+ b = vld1q_f32(bptr);
+ bptr += 4;
+ for (int i = 0; i < cells_i; i++)
{
- vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
- outptrs[i][j] += 4;
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+ outptrs[i][j] += 4;
+ }
}
}
- }
#endif // __aarch64__
#ifdef __arm_any__
- for (; channels_remaining >= 2; channels_remaining -= 2)
- {
- // Matrices used and computed during this transform
- float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ for (; channels_remaining >= 2; channels_remaining -= 2)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr += 2;
+ inptr += 2;
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
- // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
- // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
- // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
- }
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
- // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
- // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
- // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
- f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
- }
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
- // Write out the output tile
- b = vld1_f32(bptr);
- bptr += 2;
- for (int i = 0; i < cells_i; i++)
+ // Write out the output tile
+ b = vld1_f32(bptr);
+ bptr += 2;
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif
+ for (; channels_remaining; channels_remaining--)
{
- for (int j = 0; j < cells_j; j++)
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][4], f[4][4], b;
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
{
- vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
- outptrs[i][j] += 2;
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ b = *(bptr++);
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j] + b;
+ }
}
}
}
-#endif
- for (; channels_remaining; channels_remaining--)
+ else
{
- // Matrices used and computed during this transform
- float F[6][6], FZ[6][4], f[4][4], b;
-
- // Read a 6x6 tile in the Winograd domain
- for (int i = 0, m = 0; i < 6; i++)
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
{
- for (int j = 0; j < 6; j++, m++)
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
{
- F[i][j] = *(inptr + m*matrix_stride);
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
}
- }
- inptr++;
+ inptr += 4;
- // Compute the matrix F Z
- for (int i = 0; i < 6; i++)
- {
- FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
- FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
- FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
- FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
- }
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
- // Compute the output tile f = ZT F Z
- for (int j = 0; j < 4; j++)
- {
- f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
- f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
- f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
- f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
+ }
}
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
- // Write out the output tile
- b = *(bptr++);
- for (int i = 0; i < cells_i; i++)
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif
+ for (; channels_remaining; channels_remaining--)
{
- for (int j = 0; j < cells_j; j++)
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
{
- *(outptrs[i][j]++) = f[i][j] + b;
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
}
}
}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index badeb07405..f248821de6 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -47,8 +47,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info));
- switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
- weights_info, dilation, act_info))
+ switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
{
@@ -80,7 +79,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
{
- switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, dilation, act_info))
+ switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
//Validate Winograd
@@ -101,15 +100,19 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
return Status{};
}
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_UNUSED(weights_info);
ARM_COMPUTE_UNUSED(act_info);
if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
- && (conv_info.stride().second == 1) && (biases != nullptr) && (dilation == Size2D(1U, 1U)))
+ && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)))
{
return ConvolutionMethod::WINOGRAD;
}
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index f82845c7ad..126be46b2e 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -52,7 +52,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -83,9 +83,9 @@ NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
_weights = weights;
_input = input;
@@ -260,8 +260,8 @@ void NEWinogradLayer::run()
Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, weights, biases, output, conv_info));
return Status{};
}
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 27216af6d1..3a365253cb 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -76,22 +76,17 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
TEST_SUITE(NEON)
TEST_SUITE(ConvolutionLayer)
-DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
- }),
- framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
- })),
- framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(16U), 1, DataType::F32, 0)
- })),
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
+ }),
+ framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
+ })),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(6U, 6U, 1U), 1, DataType::F32, 0),
TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
@@ -103,11 +98,10 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
PadStrideInfo(3, 2, 1, 0)
})),
framework::dataset::make("Expected", { ConvolutionMethod::WINOGRAD, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
- input_info, weights_info, biases_info, output_info, conv_info, expected)
+ input_info, weights_info, output_info, conv_info, expected)
{
ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
&weights_info.clone()->set_is_resizable(false),
- &biases_info.clone()->set_is_resizable(false),
&output_info.clone()->set_is_resizable(false), conv_info);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
@@ -117,6 +111,9 @@ TEST_SUITE(WinogradLayer)
template <typename T>
using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
+template <typename T>
+using NEWinogradConvolutionLayerNoBiasFixture = WinogradConvolutionLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T, false>;
+
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
@@ -128,6 +125,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, frame
validate(Accessor(_target), _reference, tolerance_f32);
}
+FIXTURE_DATA_TEST_CASE(RunSmallNoBias, NEWinogradConvolutionLayerNoBiasFixture<float>, framework::DatasetMode::PRECOMMIT,
+ combine(combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+ datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ ActivationFunctionsDataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_f32);
+}
+
TEST_SUITE_END()
TEST_SUITE_END()
diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp
index 1e8c19fc5e..358cec3d6f 100644
--- a/tests/validation/NEON/DilatedConvolutionLayer.cpp
+++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp
@@ -66,22 +66,17 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
TEST_SUITE(NEON)
TEST_SUITE(DilatedConvolutionLayer)
-DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
- }),
- framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
- })),
- framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(16U), 1, DataType::F32, 0)
- })),
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
+ }),
+ framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
+ })),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(6U, 6U, 1U), 1, DataType::F32, 0),
TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32, 0),
@@ -98,11 +93,10 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
Size2D(3U, 3U)
})),
framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
- input_info, weights_info, biases_info, output_info, conv_info, dilation, expected)
+ input_info, weights_info, output_info, conv_info, dilation, expected)
{
ConvolutionMethod is_valid = NEConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
&weights_info.clone()->set_is_resizable(false),
- &biases_info.clone()->set_is_resizable(false),
&output_info.clone()->set_is_resizable(false),
conv_info, WeightsInfo(), dilation);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h
index 5210cbf720..481eb93e80 100644
--- a/tests/validation/fixtures/WinogradLayerFixture.h
+++ b/tests/validation/fixtures/WinogradLayerFixture.h
@@ -48,7 +48,7 @@ namespace validation
{
using namespace arm_compute::misc::shape_calculator;
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool use_bias = true>
class WinogradConvolutionLayerValidationFixture : public framework::Fixture
{
public:
@@ -93,7 +93,7 @@ protected:
// Create and configure function
FunctionType conv;
- conv.configure(&src, &weights, &bias, &dst, info, act_info);
+ conv.configure(&src, &weights, (use_bias) ? &bias : nullptr, &dst, info, act_info);
ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -133,7 +133,14 @@ protected:
// Fill reference
fill(src, 0, -1.f, 1.f);
fill(weights, 1, -1.f, 1.f);
- fill(bias, 2, -1.f, 1.f);
+ if(use_bias)
+ {
+ fill(bias, 2, -1.f, 1.f);
+ }
+ else
+ {
+ fill(bias, 2, 0.f, 0.f);
+ }
return (act_info.enabled()) ? reference::activation_layer<T>(reference::convolution_layer<T>(src, weights, bias, output_shape, info), act_info) : reference::convolution_layer<T>(src, weights, bias,
output_shape, info);