From e65790294158a650ed8ca708eb7a503f9849a97f Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Tue, 20 Aug 2019 17:25:25 +0100
Subject: COMPMID-2596 MobilenetSSD produce wrong results

Update GEMM assembly code.

Change-Id: Id315c51a11aa89915727c4d388e9335982216a2d
Signed-off-by: Michalis Spyrou
Reviewed-on: https://review.mlplatform.org/c/1774
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/core/NEON/kernels/arm_gemm/quantized.cpp | 37 +++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index a51ed3f82a..28f01bd252 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -435,8 +435,17 @@ template void requantize_block_32(const ARequantizeLayer32 &qp, unsigned int wid
  * However, beyond this point we always use signed values in both cases.
  * The instructions that need to be different are therefore wrapped in
  * helper functions below.
+ *
+ * The general strategy used is to load vectors of 16 bytes and accumulate
+ * (using uadalp/sadalp or AArch32 equivalents) into 8x16-bit accumulators.
+ * These are then reduced (using uadalp/sadalp again) into 4x32-bit
+ * accumulators. The 4 accumulators for up to 4 rows being processed are
+ * then added together into a single output vector using pairwise adds.
+ *
+ * This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
+ * occur before the 16-bit accumulators can overflow - which is every 32
+ * iterations (512 total bytes processed). This is explained more below.
*/ - namespace { struct row_sum_helpers { const ARequantizeLayer32 &qp; @@ -463,11 +472,33 @@ namespace { int32x4_t finalsums[rows]; for (unsigned int i=0; i 0 && ((i & 31) == 0)) { + finalsums[r] = vpadalq_s16(finalsums[r], sums[r]); + sums[r] = vdupq_n_s16(0); + } sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]); } } @@ -484,7 +515,7 @@ namespace { } for (unsigned int i=0; i