diff options
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/quantized.cpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/quantized.cpp | 23 |
1 files changed, 10 insertions, 13 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp index 28f01bd252..bffb7ddcb3 100644 --- a/src/core/NEON/kernels/arm_gemm/quantized.cpp +++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp @@ -495,10 +495,10 @@ namespace { * We could do 64 adds in the signed case, but that * optimization is not worth the complexity. */ - if (i > 0 && ((i & 31) == 0)) { - finalsums[r] = vpadalq_s16(finalsums[r], sums[r]); - sums[r] = vdupq_n_s16(0); - } + if (i > 0 && ((i & 31) == 0)) { + finalsums[r] = vpadalq_s16(finalsums[r], sums[r]); + sums[r] = vdupq_n_s16(0); + } sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]); } } @@ -526,6 +526,7 @@ namespace { * that the terms can simply be added in the requantize code. * */ switch (rows) { + default: case 1: /* If we only have one output, just use ADDV. Multiply * the offset into all four components separately so it @@ -567,8 +568,6 @@ namespace { vst1q_s32(row_bias, t0); break; - default: - break; } } @@ -736,12 +735,11 @@ inline void add_block(const int8_t *input, unsigned int in_stride, int32_t *outp } } - /* "first_col" parameter is used to offset the read into the qp.bias array, * in cases where we are not computing the first columns of the output (i.e. * in multithreaded cases where we divide columns across threads) */ template<typename T> -void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int first_col) { +void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col) { memset(reinterpret_cast<void *>(col_bias), 0, width * sizeof(int32_t)); for (unsigned int row=0; row<height; row+=4) { @@ -752,6 +750,7 @@ void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned if (numcols==16) { switch(numrows) { + default: case 1: add_block<1>(input + row * in_stride + col, in_stride, col_bias + col); break; @@ -767,8 +766,6 @@ void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned case 4: add_block<4>(input + row * in_stride + col, in_stride, col_bias + col); break; - default: - break; } } else { for (; col<width; col++) { @@ -788,15 +785,15 @@ void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned result = (qp.a_offset * qp.b_offset * depth) - (result * qp.a_offset); if (qp.bias != nullptr) { - result += qp.bias[col + first_col]; + result += qp.bias[multi * qp.bias_multi_stride + col + first_col]; } col_bias[col] = result; } } -template void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int first_col); -template void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int first_col); +template void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col); +template void compute_col_sums(const ARequantizeLayer32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col); } // namespace arm_gemm |