author     Michalis Spyrou <michalis.spyrou@arm.com>   2019-08-20 17:25:25 +0100
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>   2019-08-21 17:05:47 +0100
commit     e65790294158a650ed8ca708eb7a503f9849a97f (patch)
tree       2e928efb78f845760860848468f4f076745f18c9
parent     7b5aa651217946a6098cb0116c98ce41edfdfd30 (diff)
COMPMID-2596 MobilenetSSD produces wrong results
Update GEMM assembly code.

Change-Id: Id315c51a11aa89915727c4d388e9335982216a2d
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1774
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/core/NEON/kernels/arm_gemm/quantized.cpp | 37
1 file changed, 34 insertions(+), 3 deletions(-)
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index a51ed3f82a..28f01bd252 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -435,8 +435,17 @@ template void requantize_block_32(const ARequantizeLayer32 &qp, unsigned int wid
* However, beyond this point we always use signed values in both cases.
* The instructions that need to be different are therefore wrapped in
* helper functions below.
+ *
+ * The general strategy used is to load vectors of 16 bytes and accumulate
+ * (using uadalp/sadalp or AArch32 equivalents) into 8x16-bit accumulators.
+ * These are then reduced (using uadalp/sadalp again) into 4x32-bit
+ * accumulators. The 4 accumulators for up to 4 rows being processed are
+ * then added together into a single output vector using pairwise adds.
+ *
+ * This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
+ * occur before the 16-bit accumulators can overflow - which is every 32
+ * iterations (512 total bytes processed). This is explained more below.
*/
-
namespace {
struct row_sum_helpers {
const ARequantizeLayer32 &qp;
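The strategy described in the comment added above maps onto a small pair of NEON intrinsics. As a hedged illustration only (a sketch of the idea, not the library's exact accumulate_16 implementation; the function names here are hypothetical), an AArch64 version of the 16-byte accumulate step could look like this:

    #include <arm_neon.h>
    #include <cstdint>

    /* Unsigned flavour: uadalp (vpadalq_u8) pairwise-adds 16 loaded bytes
     * into eight 16-bit lanes. The accumulator is kept as a signed vector,
     * as in the code above, so it is reinterpreted around the unsigned
     * instruction. */
    static inline int16x8_t accumulate_16_u8(const uint8_t *ptr, int16x8_t sum) {
        uint8x16_t in  = vld1q_u8(ptr);                   /* load 16 bytes   */
        uint16x8_t acc = vreinterpretq_u16_s16(sum);      /* view as u16     */
        return vreinterpretq_s16_u16(vpadalq_u8(acc, in));
    }

    /* Signed flavour: sadalp (vpadalq_s8) accumulates directly. */
    static inline int16x8_t accumulate_16_s8(const int8_t *ptr, int16x8_t sum) {
        return vpadalq_s8(sum, vld1q_s8(ptr));
    }

Each call adds two input bytes into every 16-bit lane, which is what makes the overflow budget in the next hunk easy to reason about.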
@@ -463,11 +472,33 @@ namespace {
int32x4_t finalsums[rows];
for (unsigned int i=0; i<rows; i++) {
- sums[i] = vdupq_n_s16(0);
+ sums[i] = vdupq_n_s16(0);
+ finalsums[i] = vdupq_n_s32(0);
}
for (unsigned int i=0; i<blocks; i++) {
for (unsigned int r=0; r<rows; r++) {
+ /* If we add too many blocks together, we run the risk
+ * of overflowing the intermediate 16-bit accumulators,
+ * especially in the unsigned case where we later treat
+ * the accumulator as signed.
+ *
+ * In that case, the maximum (signed) value is 16383,
+ * which is safe for 64 (unsigned) accumulations (255*64
+ * = 16,320).
+ *
+ * Each invocation of pairwise add adds 2 values to the
+ * accumulator - so in the unsigned case we can do 32
+ * adds before we need to reset the 16-bit accumulator
+ * by adding into the 32-bit 'finalsums'.
+ *
+ * We could do 64 adds in the signed case, but that
+ * optimization is not worth the complexity.
+ */
+ if (i > 0 && ((i & 31) == 0)) {
+ finalsums[r] = vpadalq_s16(finalsums[r], sums[r]);
+ sums[r] = vdupq_n_s16(0);
+ }
sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]);
}
}
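The bound in the comment checks out: each vpadalq_u8 adds two bytes (at most 2 * 255 = 510) into every 16-bit lane, so 32 block iterations contribute at most 32 * 510 = 16,320, just under the 16,383 limit cited. As a hypothetical, self-contained rendering of the loop above for a single row (the function name, and the assumption that len is a multiple of 16, are illustrative; this is not the library's code):

    #include <arm_neon.h>
    #include <cstdint>

    int32_t row_sum_u8(const uint8_t *input, unsigned int len) {
        const unsigned int blocks = len / 16;     /* assume len % 16 == 0 */
        int16x8_t sum      = vdupq_n_s16(0);
        int32x4_t finalsum = vdupq_n_s32(0);

        for (unsigned int i = 0; i < blocks; i++) {
            /* Every 32 blocks, fold the 16-bit lanes into the 32-bit
             * accumulator before they can overflow (32 * 510 = 16,320). */
            if (i > 0 && ((i & 31) == 0)) {
                finalsum = vpadalq_s16(finalsum, sum);
                sum      = vdupq_n_s16(0);
            }
            uint16x8_t acc = vreinterpretq_u16_s16(sum);
            sum = vreinterpretq_s16_u16(vpadalq_u8(acc, vld1q_u8(input + i * 16)));
        }

        /* Fold in whatever remains in the 16-bit lanes. */
        finalsum = vpadalq_s16(finalsum, sum);

        /* Horizontal add of the four 32-bit lanes (AArch64 only). */
        return vaddvq_s32(finalsum);
    }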
@@ -484,7 +515,7 @@ namespace {
}
for (unsigned int i=0; i<rows; i++) {
- finalsums[i] = vpaddlq_s16(sums[i]);
+ finalsums[i] = vpadalq_s16(finalsums[i], sums[i]);
}
int32x4_t t0, t1;
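The hunk cuts off at the declaration of t0 and t1, which feed the cross-row pairwise adds mentioned in the strategy comment. As a hedged sketch of that final step (AArch64 only, since vpaddq_s32 has no AArch32 equivalent; the helper name is hypothetical), four per-row 4x32-bit partial sums can be collapsed into one vector whose lane r holds the total for row r:

    #include <arm_neon.h>

    static inline int32x4_t reduce_4_rows(int32x4_t s0, int32x4_t s1,
                                          int32x4_t s2, int32x4_t s3) {
        int32x4_t t0 = vpaddq_s32(s0, s1); /* {s0[0]+s0[1], s0[2]+s0[3],
                                            *  s1[0]+s1[1], s1[2]+s1[3]} */
        int32x4_t t1 = vpaddq_s32(s2, s3);
        return vpaddq_s32(t0, t1);         /* lane r = full sum of row r */
    }

Two levels of pairwise adds suffice because each vpaddq_s32 halves the number of partial sums per row while packing four rows' results side by side.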