author     Georgios Pinitas <georgios.pinitas@arm.com>    2019-10-16 19:21:40 +0100
committer  Michele Di Giorgio <michele.digiorgio@arm.com> 2019-11-06 15:53:24 +0000
commit     dbdea0d1c025b18d4d82c278c87454427918f5b4 (patch)
tree       68bc25452f5d5b41006fb507c41516446cf8e457
parent     75d47330e7ca0325cf5d83711452f6aeb085998f (diff)
download   ComputeLibrary-dbdea0d1c025b18d4d82c278c87454427918f5b4.tar.gz
COMPMID-2308: NEConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters
Change-Id: Ic1bf5f0d21ccd525f84213a360f7e199d7f50577
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2177
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/NEON/NEAsymm.h                                            99
-rw-r--r--  arm_compute/core/NEON/NEKernels.h                                           1
-rw-r--r--  arm_compute/core/NEON/NEMath.h                                             11
-rw-r--r--  arm_compute/core/NEON/NEMath.inl                                            8
-rw-r--r--  arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h         76
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h                   8
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h             10
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h                  29
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h                    6
-rw-r--r--  arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h                      4
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/add.h                             82
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/eor.h                             56
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h                       2
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h                     43
-rw-r--r--  arm_compute/core/NEON/wrapper/traits.h                                     16
-rw-r--r--  arm_compute/core/QuantizationInfo.h                                        30
-rw-r--r--  arm_compute/core/Types.h                                                   13
-rw-r--r--  arm_compute/core/Utils.h                                                    5
-rw-r--r--  arm_compute/core/utils/quantization/AsymmHelpers.h                         15
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h               14
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h                    4
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h          51
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h                     6
-rw-r--r--  src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp              136
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp                         4
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp                    5
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp   438
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp                        275
-rw-r--r--  src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp                          2
-rw-r--r--  src/core/NEON/kernels/NEWeightsReshapeKernel.cpp                            2
-rw-r--r--  src/core/Utils.cpp                                                          4
-rw-r--r--  src/core/utils/quantization/AsymmHelpers.cpp                               38
-rw-r--r--  src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp                      16
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp                      88
-rw-r--r--  src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp                          7
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp               148
-rw-r--r--  tests/AssetsLibrary.h                                                       1
-rw-r--r--  tests/Utils.h                                                               1
-rw-r--r--  tests/validation/Helpers.cpp                                               18
-rw-r--r--  tests/validation/Helpers.h                                                 18
-rw-r--r--  tests/validation/NEON/ConvolutionLayer.cpp                                 37
-rw-r--r--  tests/validation/fixtures/ConvolutionLayerFixture.h                        77
-rw-r--r--  tests/validation/reference/Convolution3d.h                                 51
-rw-r--r--  tests/validation/reference/ConvolutionLayer.cpp                            20
-rw-r--r--  tests/validation/reference/ConvolutionLayer.h                               4
45 files changed, 1592 insertions, 387 deletions
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 56d4c09f9..a3bd7e28f 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -115,6 +115,66 @@ uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
return out_u8;
}
+/** Performs final quantization step on 16 elements for symmetric quantization
+ *
+ * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
+ *
+ * @param in_s32 Input to be quantized.
+ * @param result_fixedpoint_multiplier Result multiplier parameter
+ * @param result_shift Result shift parameter
+ * @param result_offset_after_shift_s32 Result offset parameter
+ * @param min_s8 Relu lower bound
+ * @param max_s8 Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
+ const int32x4x4_t &result_fixedpoint_multiplier,
+ const int32x4x4_t &result_shift,
+ const int32x4_t &result_offset_after_shift_s32,
+ const int8x16_t &min_s8,
+ const int8x16_t &max_s8)
+{
+ // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+ in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]);
+ in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]);
+ in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]);
+ in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]);
+
+ // Round to the nearest division by a power-of-two using result_shift_s32
+ in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]);
+ in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]);
+ in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]);
+ in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]);
+
+ // Add the offset terms
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
/** Performs final quantization step on single element
*
* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
@@ -154,6 +214,45 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
return out_u8;
}
+/** Performs final quantization step on single element
+ *
+ * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
+ *
+ * @param[in] in_value Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier Result multiplier parameter
+ * @param[in] result_shift Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8 Relu lower bound
+ * @param[in] max_s8 Relu upper bound
+ *
+ * @return Quantized value
+ */
+template <bool is_bounded_relu>
+inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
+ int32_t result_shift, int32_t result_offset_after_shift_s32,
+ int8_t min_s8, int8_t max_s8)
+{
+ int32x4_t in_s32 = vdupq_n_s32(in_value);
+
+ // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+ in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+
+ // Shift value by result_shift_s32
+ in_value = rounding_divide_by_pow2(in_value, result_shift);
+
+ // Add the offset term
+ in_value += result_offset_after_shift_s32;
+
+ // Bound the result
+ int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+ if(is_bounded_relu)
+ {
+ out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
+ }
+
+ return out_s8;
+}
+
/** Dequantize a neon vector holding 8 quantized values.
*
* @param[in] qv Input values to be dequantized.
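Taken together, the new finalize_quantization_symm applies, per lane: a saturating rounding doubling multiply-high, a rounding right shift by a per-channel exponent, an offset add, a saturating narrow to int8 and an optional bounded ReLU. A scalar sketch of that pipeline follows (illustrative only, not part of the patch; saturation of the multiply is omitted and all names are made up):

    // Scalar model of the per-lane requantization done by finalize_quantization_symm.
    #include <algorithm>
    #include <cstdint>

    int8_t requantize_s32_to_s8(int32_t acc, int32_t multiplier, int32_t shift,
                                int32_t offset, int8_t min_s8, int8_t max_s8)
    {
        // Rounding doubling multiply-high: roughly round(acc * multiplier / 2^31)
        const int64_t prod = static_cast<int64_t>(acc) * multiplier;
        const int32_t high = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        // Rounding divide by 2^shift (ties away from zero), as rounding_divide_by_pow2 does
        const int32_t mask      = (1 << shift) - 1;
        const int32_t remainder = high & mask;
        const int32_t threshold = (mask >> 1) + ((high < 0) ? 1 : 0);
        int32_t out             = (high >> shift) + ((remainder > threshold) ? 1 : 0);
        // Offset, saturate to the int8 range, then apply the optional bounded ReLU
        out = std::max<int32_t>(-128, std::min<int32_t>(127, out + offset));
        out = std::max<int32_t>(min_s8, std::min<int32_t>(max_s8, out));
        return static_cast<int8_t>(out);
    }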
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 33a640fa0..aa46a346e 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -46,6 +46,7 @@
#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
#include "arm_compute/core/NEON/kernels/NECropKernel.h"
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 560abd6cd..8593059b1 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -129,6 +129,17 @@ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
* @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
*
* @param[in] x Vector of 4 elements
+ * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
+ *
+ * @param[in] x Vector of 4 elements
* @param[in] exponent Integer value used to round to nearest division by a power-of-two
*
* @return the nearest division by a power-of-two using exponent
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 61315e8db..f1c9c2024 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -294,6 +294,14 @@ inline float32x2_t vsin_f32(float32x2_t val)
#endif /* DOXYGEN_SKIP_THIS */
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
+{
+ const int32x4_t shift_vec = vnegq_s32(exponent);
+ const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+ const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+ return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
{
const int32x4_t shift_vec = vdupq_n_s32(-exponent);
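The vector-exponent overload mirrors the existing scalar-exponent one but lets every lane carry its own shift, which is exactly what a per-channel output stage needs. A minimal usage sketch (the shift values are made up):

    #include "arm_compute/core/NEON/NEMath.h"
    #include <arm_neon.h>

    int32x4_t shift_four_channels(int32x4_t acc)
    {
        const int32_t per_channel_shifts[4] = { 3, 5, 4, 6 }; // one right shift per output channel
        return arm_compute::rounding_divide_by_pow2(acc, vld1q_s32(per_channel_shifts));
    }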
diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
new file mode 100644
index 000000000..d3f090727
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__
+#define __ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel to convert asymmetric quantized values from unsigned to signed and vice-versa */
+class NEConvertQuantizedSignednessKernel : public INEKernel
+{
+public:
+ const char *name() const override
+ {
+ return "NEConvertQuantizedSignednessKernel";
+ }
+ /** Default constructor */
+ NEConvertQuantizedSignednessKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
+ /** Initialize the kernel's input, output.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Data types supported: opposite of @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor. Data types supported: opposite of @p input.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__ */

diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 5c0104d13..f47012684 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,13 +60,13 @@ public:
NEGEMMInterleave4x4Kernel();
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: All
* @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor info. Data types supported: All
* @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
*
* @return a status
@@ -79,7 +79,7 @@ public:
private:
/** Common signature for all the transpose functions
*
- * @param[in] input An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input An input tensor. Data types supported: All
* @param[out] output The output tensor. Data type supported: same as @p input
* @param[in] window Region on which to execute the kernel.
*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index 354ae21e2..6467a8dd0 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,15 +62,15 @@ public:
* The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
* kernels change the layout of the original matrices to be more cache-friendly.
*
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+ * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
*/
void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
*
- * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
- * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+ * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+ * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
* @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
*
* @return a status
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 4eab86d00..ce3dddbe1 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data type supported: QASYMM8
+ * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
* @param[in] k Number of matrix A columns (or matrix B rows)
* @param[in] is_reshaped True if the input tensor has been reshaped
@@ -75,7 +75,7 @@ public:
}
/** Initialise the kernel's input and output.
*
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
* @param[in] num_mtx_a_cols Number of matrix A columns
* @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
@@ -83,7 +83,7 @@ public:
void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override;
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
*
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8
+ * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
* @param[in] num_mtx_a_cols Number of matrix A columns
* @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
@@ -94,6 +94,14 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Execution of the reduction kernel specialized on the input type
+ *
+ * @param[in] window Execution window
+ */
+ template <typename T>
+ void run_internal(const Window &window);
};
/** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
@@ -110,7 +118,7 @@ public:
}
/** Initialise the kernel's input and output.
*
- * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
* @param[in] num_mtx_b_rows Number of matrix B rows
* @param[in] is_transposed1xW True if the input tensor is transposed 1xW
@@ -118,7 +126,7 @@ public:
void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override;
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
*
- * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] mtx_b Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
* @param[in] num_mtx_b_rows Number of matrix B rows
* @param[in] is_transposed1xW True if the input tensor is transposed 1xW
@@ -129,6 +137,15 @@ public:
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Execution of the reduction kernel specialized on the input type
+ *
+ * @param[in] window Execution window
+ * @param[in] info Thread-related information
+ */
+ template <typename T>
+ void run_internal(const Window &window, const ThreadInfo &info);
};
} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index b7fbfcfcd..54086d1c2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,13 +74,13 @@ public:
}
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor info. Data type supported: same as @p input.
*
* @return a status
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index bba18a8fa..585c707bb 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -75,7 +75,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32
* @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
@@ -85,7 +85,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
*
* @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F16/F32
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32
* @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h
index 4f4d24448..183917048 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/add.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/add.h
@@ -63,13 +63,13 @@ VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VADD_IMPL
+// VQADD: Vector saturating add (No notion of saturation for floating point)
#define VQADD_IMPL(stype, vtype, prefix, postfix) \
inline vtype vqadd(const vtype &a, const vtype &b) \
{ \
return prefix##_##postfix(a, b); \
}
-// VQADD: Vector saturating add (No notion of saturation for floating point)
VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8)
VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8)
VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16)
@@ -96,6 +96,86 @@ VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#undef VQADD_IMPL
+
+// VADDW: Vector widening add
+#define VADDW_IMPL(wtype, vtype, prefix, postfix) \
+ inline wtype vaddw(const wtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8)
+VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8)
+VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16)
+VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16)
+VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32)
+VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32)
+#undef VADDW_IMPL
+
+// VADDL: Vector long add
+#define VADDL_IMPL(wtype, vtype, prefix, postfix) \
+ inline wtype vaddl(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8)
+VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8)
+VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16)
+VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16)
+VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32)
+VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32)
+#undef VADDL_IMPL
+
+#if defined(__aarch64__)
+// VADDV: Across vector add
+#define VADDV_IMPL(stype, vtype, prefix, postfix) \
+ inline stype vaddv(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8)
+VADDV_IMPL(int8_t, int8x8_t, vaddv, s8)
+VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16)
+VADDV_IMPL(int16_t, int16x4_t, vaddv, s16)
+VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32)
+VADDV_IMPL(int32_t, int32x2_t, vaddv, s32)
+VADDV_IMPL(float, float32x2_t, vaddv, f32)
+
+VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8)
+VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8)
+VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16)
+VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16)
+VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32)
+VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32)
+VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64)
+VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64)
+VADDV_IMPL(float, float32x4_t, vaddvq, f32)
+#undef VADDV_IMPL
+#endif // defined(__aarch64__)
+
+// VPADDL: Signed add long pairwise
+#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \
+ inline ltype vpaddl(const vtype &a) \
+ { \
+ return prefix##_##postfix(a); \
+ }
+
+VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8)
+VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8)
+VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16)
+VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16)
+VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32)
+VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32)
+
+VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8)
+VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8)
+VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16)
+VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16)
+VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32)
+VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32)
+#undef VPADDL_IMPL
} // namespace wrapper
} // namespace arm_compute
#endif /* __ARM_COMPUTE_WRAPPER_ADD_H__ */
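The widening add family added here (vaddw/vaddl/vaddv/vpaddl) is what lets the reduction kernels sum rows of 8-bit values without overflowing. A minimal sketch of that pattern using plain arm_neon intrinsics rather than the wrapper namespace (the row length is assumed to be a multiple of 16):

    #include <arm_neon.h>
    #include <cstdint>

    uint32_t sum_row_u8(const uint8_t *row, int n)
    {
        uint32x4_t acc = vdupq_n_u32(0);
        for(int i = 0; i < n; i += 16)
        {
            const uint8x16_t v   = vld1q_u8(row + i); // 16 x u8
            const uint16x8_t s16 = vpaddlq_u8(v);     // pairwise widen to 8 x u16
            acc = vpadalq_u16(acc, s16);              // pairwise widen-accumulate into 4 x u32
        }
    #if defined(__aarch64__)
        return vaddvq_u32(acc);                       // across-vector add
    #else
        const uint32x2_t lo = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
        return vget_lane_u32(vpadd_u32(lo, lo), 0);
    #endif
    }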
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
new file mode 100644
index 000000000..100280832
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_EOR_H__
+#define __ARM_COMPUTE_WRAPPER_EOR_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEOR_IMPL(vtype, prefix, postfix) \
+ inline vtype veor(const vtype &a, const vtype &b) \
+ { \
+ return prefix##_##postfix(a, b); \
+ }
+
+VEOR_IMPL(uint8x8_t, veor, u8)
+VEOR_IMPL(int8x8_t, veor, s8)
+VEOR_IMPL(uint16x4_t, veor, u16)
+VEOR_IMPL(int16x4_t, veor, s16)
+VEOR_IMPL(uint32x2_t, veor, u32)
+VEOR_IMPL(int32x2_t, veor, s32)
+
+VEOR_IMPL(uint8x16_t, veorq, u8)
+VEOR_IMPL(int8x16_t, veorq, s8)
+VEOR_IMPL(uint16x8_t, veorq, u16)
+VEOR_IMPL(int16x8_t, veorq, s16)
+VEOR_IMPL(uint32x4_t, veorq, u32)
+VEOR_IMPL(int32x4_t, veorq, s32)
+
+#undef VEOR_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_EOR_H__ */
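veor is the core of the signedness conversion added by this patch: XOR-ing an 8-bit asymmetric value with 0x80 shifts it by 128, turning QASYMM8 into QASYMM8_SIGNED and back, so only the quantization offset has to be corrected. A scalar check (illustrative only):

    #include <cstdint>

    int8_t  u8_to_s8(uint8_t v) { return static_cast<int8_t>(v ^ 0x80); }  // equals v - 128
    uint8_t s8_to_u8(int8_t v)  { return static_cast<uint8_t>(v) ^ 0x80; } // equals v + 128
    // e.g. u8_to_s8(0) == -128 and u8_to_s8(255) == 127; the kernel further below applies the
    // same XOR lane-wise and adjusts the offset in the output quantization info.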
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index 0362ca125..6eae1cf80 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -35,6 +35,7 @@
#include "arm_compute/core/NEON/wrapper/intrinsics/combine.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/div.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/exp.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h"
@@ -56,6 +57,7 @@
#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/round.h"
#include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h"
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
new file mode 100644
index 000000000..0956959e2
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_REINTERPRET_H__
+#define __ARM_COMPUTE_WRAPPER_REINTERPRET_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+inline int32x4_t vreinterpret_s32(const uint32x4_t &val)
+{
+ return vreinterpretq_s32_u32(val);
+}
+inline int32x4_t vreinterpret_s32(const int32x4_t &val)
+{
+ return val;
+}
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_REINTERPRET_H__ */
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
index cc22597c2..d700aacaa 100644
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ b/arm_compute/core/NEON/wrapper/traits.h
@@ -112,6 +112,22 @@ template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float
template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type;
/** Helper type template to get the tag type of a neon vector */
template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type;
+
+/** Promote a type */
+template <typename T> struct promote { };
+template <> struct promote<uint8_t> { using type = uint16_t; };
+template <> struct promote<int8_t> { using type = int16_t; };
+template <> struct promote<uint16_t> { using type = uint32_t; };
+template <> struct promote<int16_t> { using type = int32_t; };
+template <> struct promote<uint32_t> { using type = uint64_t; };
+template <> struct promote<int32_t> { using type = int64_t; };
+template <> struct promote<float> { using type = float; };
+template <> struct promote<half> { using type = half; };
+
+/** Get promoted type */
+template <typename T>
+using promote_t = typename promote<T>::type;
+
// clang-format on
// *INDENT-ON*
} // namespace traits
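The promote trait gives kernels a compile-time way to pick a wider accumulator for a given element type; a couple of illustrative checks:

    #include "arm_compute/core/NEON/wrapper/traits.h"
    #include <type_traits>

    static_assert(std::is_same<arm_compute::wrapper::traits::promote_t<uint8_t>, uint16_t>::value,
                  "u8 promotes to u16");
    static_assert(std::is_same<arm_compute::wrapper::traits::promote_t<int16_t>, int32_t>::value,
                  "s16 promotes to s32");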
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 5e6e5b349..949ee66b7 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -250,6 +250,36 @@ inline int8_t quantize_qsymm8(float value, const QuantizationInfo &qinfo)
return quantized;
}
+/** Quantize a value given a 8-bit symmetric per channel quantization scheme
+ *
+ * @param[in] value Value to quantize
+ * @param[in] qinfo Quantization information to use for quantizing
+ * @param[in] channel_id channel index into the scale vector of quantization info
+ *
+ * @return Quantized value
+ */
+inline int8_t quantize_qsymm8_per_channel(float value, const QuantizationInfo &qinfo, size_t channel_id = 0)
+{
+ int quantized = arm_compute::round(value / qinfo.scale()[channel_id], RoundingPolicy::TO_NEAREST_UP);
+ quantized = std::max(-128, std::min(quantized, 127));
+ return quantized;
+}
+
+/** Quantize a value given a 8-bit asymmetric per channel quantization scheme
+ *
+ * @param[in] value Value to quantize
+ * @param[in] qinfo Quantization information to use for quantizing
+ * @param[in] channel_id channel index into the scale vector of quantization info
+ *
+ * @return Quantized value
+ */
+inline int8_t quantize_qasymm8_per_channel(float value, const QuantizationInfo &qinfo, size_t channel_id = 0)
+{
+ int quantized = arm_compute::round(value / qinfo.scale()[channel_id], RoundingPolicy::TO_NEAREST_UP);
+ quantized = std::max(0, std::min(quantized, 255));
+ return quantized;
+}
+
/** Dequantize a value given a 8-bit asymmetric quantization scheme
*
* @param[in] value Value to dequantize
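A small, assumed example of the per-channel symmetric helper (it relies on the QuantizationInfo constructor that takes a vector of scales): with scales {0.5, 0.25}, the value 3.0 quantizes to 6 on channel 0 and 12 on channel 1.

    #include "arm_compute/core/QuantizationInfo.h"

    void per_channel_quantize_example()
    {
        const arm_compute::QuantizationInfo qinfo(std::vector<float>{ 0.5f, 0.25f });
        const int8_t q0 = arm_compute::quantize_qsymm8_per_channel(3.f, qinfo, 0); // 6
        const int8_t q1 = arm_compute::quantize_qsymm8_per_channel(3.f, qinfo, 1); // 12
        (void)q0;
        (void)q1;
    }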
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f4955ed45..1c9e8ce89 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -77,7 +77,8 @@ enum class DataType
U8, /**< unsigned 8-bit number */
S8, /**< signed 8-bit number */
QSYMM8, /**< quantized, symmetric fixed-point 8-bit number */
- QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number */
+ QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number unsigned */
+ QASYMM8_SIGNED, /**< quantized, asymmetric fixed-point 8-bit number signed */
QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */
QASYMM8_PER_CHANNEL, /**< quantized, asymmetric per channel fixed-point 8-bit number */
U16, /**< unsigned 16-bit number */
@@ -1881,6 +1882,8 @@ struct GEMMLowpOutputStageInfo
int gemmlowp_shift{ 0 }; /**< GEMMLowp output stage shift used for quantizing to uint8 */
int gemmlowp_min_bound{ 0 }; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
int gemmlowp_max_bound{ 0 }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
+ std::vector<int> gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+ std::vector<int> gemmlowp_shifts{}; /**< GEMMLowp output stage shift used for quantizing to QASYMM8 */
};
/** GEMM LHS (Left Hand Side) matrix information */
@@ -2015,6 +2018,14 @@ public:
{
return _gemmlowp_output_stage;
};
+ /** Sets GEMMLowp output stage
+ *
+ * @param[in] output_stage Output stage to set
+ */
+ void set_gemmlowp_output_stage(GEMMLowpOutputStageInfo &output_stage)
+ {
+ _gemmlowp_output_stage = output_stage;
+ };
/** Flag which specifies if a wider accumulator should be used.
*
* @return True if a wider accumulator has to be used
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 3939491bb..a6e1ea1a8 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -114,6 +114,7 @@ inline size_t data_size_from_type(DataType data_type)
case DataType::S8:
case DataType::QSYMM8:
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
case DataType::QASYMM8_PER_CHANNEL:
return 1;
@@ -191,6 +192,7 @@ inline size_t element_size_from_data_type(DataType dt)
case DataType::U8:
case DataType::QSYMM8:
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
return 1;
case DataType::U16:
@@ -533,6 +535,7 @@ inline DataType get_promoted_data_type(DataType dt)
return DataType::S32;
case DataType::QSYMM8:
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
case DataType::QASYMM8_PER_CHANNEL:
case DataType::QSYMM16:
@@ -1024,6 +1027,7 @@ inline bool is_data_type_quantized(DataType dt)
{
case DataType::QSYMM8:
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
case DataType::QASYMM8_PER_CHANNEL:
case DataType::QSYMM16:
@@ -1045,6 +1049,7 @@ inline bool is_data_type_quantized_asymmetric(DataType dt)
switch(dt)
{
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QASYMM8_PER_CHANNEL:
case DataType::QASYMM16:
return true;
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 8ec4a331f..6b6cb007e 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -59,6 +59,21 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int *quant
* @return a status
*/
Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift);
+
+/** Calculate quantized representation of per-channel multipliers with value less than one.
+ *
+ * @param[in] iq_info Input quantization info.
+ * @param[in] wq_info Weights quantization info.
+ * @param[in] oq_info Output quantization info.
+ * @param[in, out] stage_info GemmLowp output stage info
+ *
+ * @return a status
+ */
+Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
+ GEMMLowpOutputStageInfo &stage_info);
+
/** Get minimum and maximum values for the input quantized data type
*
* @return min and max values for the quantized data type
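A rough sketch of what the newly declared per-channel helper is expected to compute (the real implementation lives in src/core/utils/quantization/AsymmHelpers.cpp; this is an assumption, not a copy of it): one fixed-point multiplier/shift pair per weights channel, derived from input_scale * weight_scale[c] / output_scale.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    arm_compute::Status per_channel_multipliers_sketch(const arm_compute::QuantizationInfo &iq,
                                                       const arm_compute::QuantizationInfo &wq,
                                                       const arm_compute::QuantizationInfo &oq,
                                                       arm_compute::GEMMLowpOutputStageInfo &info)
    {
        const size_t num_channels = wq.scale().size();
        info.gemmlowp_multipliers.resize(num_channels);
        info.gemmlowp_shifts.resize(num_channels);
        for(size_t c = 0; c < num_channels; ++c)
        {
            const float multiplier = (iq.uniform().scale * wq.scale()[c]) / oq.uniform().scale;
            int         quant_multiplier = 0;
            int         shift            = 0;
            ARM_COMPUTE_RETURN_ON_ERROR(arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &shift));
            info.gemmlowp_multipliers[c] = quant_multiplier;
            info.gemmlowp_shifts[c]      = shift;
        }
        return arm_compute::Status{};
    }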
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 3e551abf5..2d4aaa495 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -63,14 +63,14 @@ public:
NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = default;
/** Set the input and output tensors.
*
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
* @param[out] output Destination tensor. Data types supported: Same as @p weights.
*/
void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
*
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
* @param[in] output Destination tensor. Data types supported: Same as @p weights.
*
@@ -158,8 +158,8 @@ public:
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * Data types supported: QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
@@ -178,7 +178,7 @@ public:
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
@@ -203,7 +203,7 @@ private:
/** Configures the appropriate matrix multiply routine
*
* @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[out] output Output tensor. Data types supported: Same as @p input,
@@ -215,7 +215,7 @@ private:
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
*
* @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
* Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
* @param[in] output Output tensor. Data types supported: Same as @p input,
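For context, this is the combination the commit enables end to end: QASYMM8 activations with QSYMM8_PER_CHANNEL filters. A hypothetical configuration sketch (shapes, scales and strides are made up, not taken from the patch):

    #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void per_channel_convolution_sketch()
    {
        using namespace arm_compute;
        Tensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::QSYMM8_PER_CHANNEL,
                                             QuantizationInfo(std::vector<float>(8, 0.02f)))); // one scale per OFM
        biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 5)));

        NEGEMMConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

        for(Tensor *t : { &src, &weights, &biases, &dst })
        {
            t->allocator()->allocate();
        }
        conv.run();
    }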
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
index 4d7f67b94..ec56d752b 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ class NEGEMMInterleave4x4 : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's inputs, output
*
- * @param[in] input First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input First input tensor. Data types supported: All
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ITensor *input, ITensor *output);
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 12c120934..aa2c23c97 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -26,6 +26,8 @@
#include "NEActivationLayer.h"
#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
@@ -76,24 +78,24 @@ public:
* -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8 otherwise
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
*
- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
* @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8
+ * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8 otherwise
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
*
- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
+ * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
* @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
* @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[in] output Output tensor info. Data type supported: Data type supported: S32/QASYMM8
+ * @param[in] output Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*
@@ -116,21 +118,28 @@ private:
NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
NEActivationLayer _activation_func;
- Tensor _vector_sum_col;
- Tensor _vector_sum_row;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _mm_result_s32;
- const ITensor *_original_b;
- int32_t _a_offset;
- int32_t _b_offset;
- bool _run_vector_matrix_multiplication;
- bool _assembly_path;
- bool _fused_assembly_path;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _fuse_output_stage;
- bool _run_activation;
+ NEConvertQuantizedSignednessKernel _convert_to_signed_asymm;
+ NEConvertQuantizedSignednessKernel _convert_from_signed_asymm;
+
+ Tensor _vector_sum_col;
+ Tensor _vector_sum_row;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _mm_result_s32;
+ Tensor _signed_a;
+ Tensor _signed_output;
+ const ITensor *_original_b;
+ int32_t _a_offset;
+ int32_t _b_offset;
+
+ bool _run_vector_matrix_multiplication;
+ bool _assembly_path;
+ bool _fused_assembly_path;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
+ bool _run_activation;
+ bool _flip_signedness;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
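And a corresponding sketch for the low-precision GEMM itself, which now also accepts signed asymmetric operands (shapes and scales are assumed; with no output stage configured the result stays S32):

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    void lowp_gemm_signed_sketch()
    {
        using namespace arm_compute;
        Tensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.1f, 0)));  // M=4, K=64
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.2f, 0))); // K=64, N=16
        dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::S32));

        NEGEMMLowpMatrixMultiplyCore gemm;
        gemm.configure(&a, &b, nullptr, &dst);

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        gemm.run();
    }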
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
index b44c5a3ee..f5ba08bdd 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,13 +40,13 @@ class NEGEMMTranspose1xW : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's inputs, output
*
- * @param[in] input First input tensor. Data type supported: U8/S8/U16/S16/F16/U32/S32/F32/
+ * @param[in] input First input tensor. Data type supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor. Data type supported: same as @p input
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xW
*
- * @param[in] input First input tensor. Data type supported: U8/S8/U16/S16/F16/U32/S32/F32/
+ * @param[in] input First input tensor. Data type supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32
* @param[in] output Output tensor. Data type supported: same as @p input
*
* @return a status
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
new file mode 100644
index 000000000..39e030e43
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output auto initialization if not yet initialized
+ {
+ const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED;
+ const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
+ const int offset_correction = is_input_signed ? -128 : 128;
+ const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+
+ auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+ }
+
+ return std::make_pair(Status{}, calculate_max_window(*output));
+}
+} // namespace
+
+NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const uint8_t mask = 128;
+ const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+ *(output_ptr + x) = in ^ mask;
+ }
+ },
+ input, output);
+}
+} // namespace arm_compute
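The kernel added above flips a tensor between QASYMM8 and QASYMM8_SIGNED simply by XOR-ing every byte with 128: that maps each unsigned code to the signed code 128 below it, so as long as the zero point is moved by the same 128 the dequantized values are unchanged. A tiny self-contained sketch of that equivalence (illustration only, not part of this patch; the scale and zero points are made-up values):

#include <cassert>
#include <cstdint>

int main()
{
    const float scale     = 0.5f;
    const int   u8_offset = 130;             // QASYMM8 zero point (made up)
    const int   s8_offset = u8_offset - 128; // zero point of the signed view of the same data

    for(int v = 0; v <= 255; ++v)
    {
        const uint8_t u = static_cast<uint8_t>(v);
        const int8_t  s = static_cast<int8_t>(u ^ 0x80); // same flip the kernel performs with wrapper::veor and vmask = 128

        const float dq_u = scale * (static_cast<int>(u) - u8_offset);
        const float dq_s = scale * (static_cast<int>(s) - s8_offset);
        assert(dq_u == dq_s); // both encodings dequantize to the same real value
    }
    return 0;
}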
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index c92998316..a9c04824a 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -45,9 +45,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::U16, DataType::S16, DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
if(output->total_size() != 0)
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 6cec51d5a..8f5a208cb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -722,8 +722,8 @@ namespace
{
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
TensorShape in0_shape = input0->tensor_shape();
@@ -917,6 +917,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
switch(_input0->info()->data_type())
{
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
{
matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
break;
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 46e53cec1..3ada3a3c4 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -72,6 +72,58 @@ inline int32x4x4_t load(const int32_t *ptr, int32_t x)
};
}
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b),
+ vaddq_s32(a.val[1], b),
+ vaddq_s32(a.val[2], b),
+ vaddq_s32(a.val[3], b)
+ }
+ };
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b.val[0]),
+ vaddq_s32(a.val[1], b.val[1]),
+ vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
+{
+ return
+ {
+ {
+ vmulq_n_s32(a.val[0], mul_scalar),
+ vmulq_n_s32(a.val[1], mul_scalar),
+ vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multiplier)
+{
+ return
+ {
+ {
+ vmulq_s32(a.val[0], vld1q_s32(multiplier)),
+ vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
+ vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)),
+ vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))
+ }
+ };
+}
+
inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
{
int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
@@ -141,6 +193,82 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
return out_u8;
}
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0]));
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1]));
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2]));
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3]));
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
inline Window get_win_vector_sum(const Window &window)
{
Window win_vector_sum(window);
@@ -172,50 +300,12 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias)
return bias_it;
}
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b),
- vaddq_s32(a.val[1], b),
- vaddq_s32(a.val[2], b),
- vaddq_s32(a.val[3], b)
- }
- };
-}
-
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b.val[0]),
- vaddq_s32(a.val[1], b.val[1]),
- vaddq_s32(a.val[2], b.val[2]),
- vaddq_s32(a.val[3], b.val[3])
- }
- };
-}
-
-inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
-{
- return
- {
- {
- vmulq_n_s32(a.val[0], mul_scalar),
- vmulq_n_s32(a.val[1], mul_scalar),
- vmulq_n_s32(a.val[2], mul_scalar),
- vmulq_n_s32(a.val[3], mul_scalar)
- }
- };
-}
-
template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8,
int32_t a_offset, int32_t b_offset, int32_t k_offset,
- GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x)
+ int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
+ int window_step_x, int window_start_x, int window_end_x)
{
int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
if(!is_fixed_point)
@@ -251,12 +341,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
}
if(!is_fixed_point)
{
- in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier);
+ in_s32 = mul_s32(in_s32, multiplier);
}
if(is_fixed_point)
{
- vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8));
+ vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8));
}
else
{
@@ -280,24 +370,99 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
if(is_fixed_point)
{
// Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift,
- output_stage.gemmlowp_offset, static_cast<uint8_t>(output_stage.gemmlowp_min_bound), static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset, static_cast<uint8_t>(min_bound), static_cast<uint8_t>(max_bound));
}
else
{
// Finalize quantization
- in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift;
+ in_value = (in_value * multiplier) >> shift;
// Bound and store the result
if(is_bounded_relu)
{
- in_value = static_cast<uint8_t>(std::max<int32_t>(output_stage.gemmlowp_min_bound, std::min<int32_t>(output_stage.gemmlowp_max_bound, in_value)));
+ in_value = static_cast<uint8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
*(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
}
}
}
+template <bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
+ const int32_t *result_multipliers, const int32_t *result_shifts,
+ const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
+ int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
+ int window_step_x, int window_start_x, int window_end_x)
+{
+ int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
+ if(!is_fixed_point)
+ {
+ // Combine quantization offset with other offsets.
+ offset_term_s32 = add_s32(offset_term_s32, result_offset);
+ }
+
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+ if(has_a_offset)
+ {
+ in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+ }
+ if(has_bias)
+ {
+ in_s32 = add_s32(in_s32, load(bias_ptr, x));
+ }
+ if(!is_fixed_point)
+ {
+ in_s32 = add_s32(in_s32, offset_term_s32);
+ in_s32 = mul_s32(in_s32, result_multipliers + x);
+ }
+
+ if(is_fixed_point)
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm<is_bounded_relu>(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8));
+ }
+ else
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point<is_bounded_relu>(in_s32, load(result_shifts, x), min_s8, max_s8));
+ }
+ }
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+ if(has_a_offset)
+ {
+ in_value += (*(vector_sum_col_ptr + x) * a_offset);
+ }
+ if(has_bias)
+ {
+ in_value += *(bias_ptr + x);
+ }
+
+ if(is_fixed_point)
+ {
+ // Finalize and store the result
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound));
+ }
+ else
+ {
+ // Finalize quantization
+ in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
+
+ // Bound and store the result
+ if(is_bounded_relu)
+ {
+ in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ }
+ *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+ }
+ }
+}
+
template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
void run_offset_contribution_output_stage(const Window &window,
const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
@@ -307,10 +472,16 @@ void run_offset_contribution_output_stage(const Window &window,
const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
- const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset);
- const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift);
- const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_min_bound));
- const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ const int32_t multiplier = output_stage.gemmlowp_multiplier;
+ const int32_t shift = output_stage.gemmlowp_shift;
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift);
+ const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(min_bound));
+ const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(max_bound));
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -349,7 +520,8 @@ void run_offset_contribution_output_stage(const Window &window,
run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -363,7 +535,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
}
@@ -386,7 +559,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -399,7 +573,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_row_it, mm_result_it, out_it);
}
@@ -422,7 +597,8 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, bias_it, mm_result_it, out_it);
}
@@ -434,7 +610,8 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, mm_result_it, out_it);
}
@@ -448,7 +625,8 @@ void run_offset_contribution_output_stage(const Window &window,
{
run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
bias_it, mm_result_it, out_it);
}
@@ -458,7 +636,110 @@ void run_offset_contribution_output_stage(const Window &window,
{
run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ mm_result_it, out_it);
+ }
+ return;
+ }
+}
+
+template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+void run_offset_contribution_output_stage_symm(const Window &window,
+ const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
+
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
+ const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
+ const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+ Iterator mm_result_it(mm_result, win);
+ Iterator out_it(output, win);
+
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window_symm<true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window_symm<true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
+ }
+ }
+ else
+ {
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm<false, true, is_bounded_relu, is_fixed_point>(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm<false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
mm_result_it, out_it);
}
@@ -470,8 +751,18 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+ if(output->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 127);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < -128);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
if(bias != nullptr)
@@ -525,7 +816,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
}
@@ -551,7 +842,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result,
}
NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
-get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage)
+get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage)
{
static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function =
{
@@ -562,7 +853,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
{ 4, &run_offset_contribution_output_stage<false, false, true> },
{ 5, &run_offset_contribution_output_stage<true, false, true> },
{ 6, &run_offset_contribution_output_stage<false, true, true> },
- { 7, &run_offset_contribution_output_stage<true, true, true> }
+ { 7, &run_offset_contribution_output_stage<true, true, true> },
+ { 8, &run_offset_contribution_output_stage_symm<false, false, false> },
+ { 9, &run_offset_contribution_output_stage_symm<true, false, false> },
+ { 10, &run_offset_contribution_output_stage_symm<false, true, false> },
+ { 11, &run_offset_contribution_output_stage_symm<true, true, false> },
+ { 12, &run_offset_contribution_output_stage_symm<false, false, true> },
+ { 13, &run_offset_contribution_output_stage_symm<true, false, true> },
+ { 14, &run_offset_contribution_output_stage_symm<false, true, true> },
+ { 15, &run_offset_contribution_output_stage_symm<true, true, true> }
};
// Check if input is a 3D reinterpretation
@@ -574,11 +873,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
&& !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+ // Check if we need to perform fixed point requantization
const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+ // Check if we need the symmetric per-channel execution path
+ const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED;
+
// key acts as a bitset, setting the first bit on reinterpret_as_3d,
// the second on is_bounded_relu, the third on is_fixed_point, and the fourth on is_symm.
- uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
+ uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 1UL : 0UL) << 3);
return map_function.find(key)->second;
}
} // namespace
@@ -591,8 +894,9 @@ NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutpu
}
void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
- const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+ const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t k, int32_t a_offset, int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
@@ -627,7 +931,7 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- _function = get_configured_function(mm_result, vector_sum_row, output_stage);
+ _function = get_configured_function(mm_result, vector_sum_row, output, output_stage);
}
Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
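With QSYMM8_PER_CHANNEL filters every output column x carries its own requantization pair in output_stage.gemmlowp_multipliers / gemmlowp_shifts, which is why the *_symm window functions above index result_multipliers and result_shifts by x instead of reading the single legacy gemmlowp_multiplier / gemmlowp_shift, and why get_configured_function gains a fourth key bit (is_symm) selecting the signed per-channel variants at keys 8-15. A rough scalar model of the per-channel QUANTIZE_DOWN branch, for orientation only (the helper name and signature are illustrative, this is not the NEON code above):

#include <algorithm>
#include <cstdint>

inline int8_t requantize_per_channel(int32_t acc,        // S32 GEMM result with the a_offset term and bias already added
                                     int32_t offset,     // output_stage.gemmlowp_offset, folded in up front like offset_term_s32
                                     int32_t multiplier, // result_multipliers[x]
                                     int32_t shift,      // result_shifts[x]; expected negative here, matching (-result_shifts[x]) in the tail loop
                                     int32_t min_bound, int32_t max_bound, bool bounded_relu)
{
    int32_t v = (acc + offset) * multiplier;
    v >>= -shift;
    if(bounded_relu)
    {
        v = std::max(min_bound, std::min(max_bound, v));
    }
    return static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, v)));
}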
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index c1ee770db..72632492d 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,13 +27,13 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
@@ -48,7 +48,7 @@ namespace
{
Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
return Status{};
@@ -72,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITens
Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
return Status{};
@@ -128,11 +128,12 @@ Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons
return Status{};
}
-void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -149,9 +150,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: The accumulation is performed in the 32-bit promotion of the input type (unsigned for QASYMM8, signed for the signed data types)
- uint32x4_t sum_row = vdupq_n_u32(0);
+ auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -161,43 +162,41 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 4 accumulations
for(; i <= (_k - 4); i += 4)
{
- const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
- // Convert U8 to U16
- uint16x4x4_t a0_u16 =
+ // Convert 8-bit to 16-bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))
- }
+ wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
+ wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
+ wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
+ wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
};
- // Accumulate to U16
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);
+ // Accumulate to 16-bit
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
- // Accumulate to U32
- sum_row = vaddw_u16(sum_row, a0_u16.val[0]);
+ // Accumulate to 32-bit
+ sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
}
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);
+ const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
// Convert 8-bit to 16-bit
- const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));
+ const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
// Accumulate to 32-bit
- sum_row = vaddw_u16(sum_row, a0_u16);
+ sum_row = wrapper::vaddw(sum_row, a0_d16);
}
auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));
+ wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row));
},
in, out);
}
@@ -206,10 +205,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: The accumulation is performed in the 32-bit promotion of the input type (unsigned for QASYMM8, signed for the signed data types)
- uint32x4_t sum_row_u32 = vdupq_n_u32(0);
- uint32_t sum_row = 0;
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
- const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -219,37 +218,57 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 16 accumulations
for(; i <= (_k - 16); i += 16)
{
- const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
// Partial accumulations in 16-bit
- const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
// Accumulate to 32-bit
- sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
}
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- sum_row += static_cast<uint32_t>(matrix_a[i]);
+ sum_row += static_cast<TAcc>(matrix_a[i]);
}
#if defined(__aarch64__)
// Reduction operation available on 64 bit architectures only
- sum_row += vaddvq_u32(sum_row_u32);
+ sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
- uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));
- tmp = vpadd_u32(tmp, tmp);
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
- sum_row += vget_lane_u32(tmp, 0);
+ sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
},
in, out);
}
}
+void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_internal<uint8_t>(window);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ run_internal<int8_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+}
+
void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
@@ -276,11 +295,12 @@ Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons
return Status{};
}
-void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -297,17 +317,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: The accumulation is performed in the 32-bit promotion of the input type (unsigned for QASYMM8, signed for the signed data types)
- uint32x4x4_t sum_col =
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];
+ const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -316,35 +334,28 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
int i = 0;
for(; i < _k; ++i)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);
+ const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
- // Convert S8 to U16
- const uint16x8x2_t b0_u16 =
+ // Convert 8bit to 16bit
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
{
- {
- vmovl_u8(vget_low_u8(b0_u8)),
- vmovl_u8(vget_high_u8(b0_u8))
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
};
// Accumulate to 32bit
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
- }
- };
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
}
auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
- vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
- vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
- vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
},
in, out);
}
@@ -377,17 +388,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
}
// Note: The accumulation is performed in the 32-bit promotion of the input type (unsigned for QASYMM8, signed for the signed data types)
- uint32x4x4_t sum_col =
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -398,10 +407,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 4 accumulations
for(; i <= (_k - 4); i += 4)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
- const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);
- const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);
- const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
#if __arm__
asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
@@ -410,34 +419,27 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */
- // Partial accumulation in u16
- uint16x8x2_t tmp_sum =
+ // Partial accumulation in 16bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
{
- {
- vdupq_n_u16(0),
- vdupq_n_u16(0)
- }
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
};
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));
-
- // Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))
- }
- };
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
matrix_b += 4 * in_b_stride;
}
@@ -445,38 +447,51 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
// Convert 8bit to 16bit
- const uint16x8x2_t b0_u16 =
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
{
- {
- vmovl_u8(vget_low_u8(b0_u8)),
- vmovl_u8(vget_high_u8(b0_u8))
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
};
- // Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
- }
- };
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
matrix_b += in_b_stride;
}
auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
- vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
- vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
- vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
},
inb, out);
}
}
+
+void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_internal<uint8_t>(window, info);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ run_internal<int8_t>(window, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+}
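The two reduction kernels are now templated on the element type: run() inspects the tensor data type and dispatches to run_internal<uint8_t> or run_internal<int8_t>, and the accumulators are derived with wrapper::traits::promote_t so that 8-bit data is widened to 16-bit and then to 32-bit of matching signedness. A stand-in sketch of that accumulate-in-the-promoted-type pattern ("promote" below is a local illustration, not the library's trait):

#include <cstdint>
#include <vector>

template <typename T> struct promote;
template <> struct promote<uint8_t>  { using type = uint16_t; };
template <> struct promote<int8_t>   { using type = int16_t;  };
template <> struct promote<uint16_t> { using type = uint32_t; };
template <> struct promote<int16_t>  { using type = int32_t;  };

template <typename T>
int32_t row_sum(const std::vector<T> &row)
{
    using TIAcc = typename promote<T>::type;     // 16-bit intermediate accumulator
    using TAcc  = typename promote<TIAcc>::type; // 32-bit final accumulator
    TAcc acc = 0;
    for(const T v : row)
    {
        acc += static_cast<TIAcc>(v); // widen before accumulating, as the NEON vaddl/vaddw steps do
    }
    return static_cast<int32_t>(acc); // the reduction tensors are stored as S32
}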
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 0ca7fd3dc..ea3d32e62 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -55,7 +55,7 @@ TensorShape get_output_shape(const ITensorInfo *input)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::U8, DataType::S8,
DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 624833adf..649316442 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -49,7 +49,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
if(biases != nullptr)
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index fa335d757..6d276d132 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -162,6 +162,7 @@ const std::string &arm_compute::string_from_data_type(DataType dt)
{ DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
{ DataType::QASYMM8_PER_CHANNEL, "QASYMM8_PER_CHANNEL" },
{ DataType::QASYMM8, "QASYMM8" },
+ { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
{ DataType::QSYMM16, "QSYMM16" },
{ DataType::QASYMM16, "QASYMM16" },
};
@@ -292,6 +293,7 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const
converted_string = ss.str();
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
// Needs conversion to 32 bit, otherwise interpreted as ASCII values
ss << int32_t(value.get<int8_t>());
@@ -448,6 +450,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const
print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
break;
@@ -485,6 +488,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp
case DataType::QASYMM8_PER_CHANNEL:
return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index cdd48972e..386d75eca 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -108,6 +108,44 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
+
+arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
+ GEMMLowpOutputStageInfo &stage_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(oq_info.scale().empty());
+
+ const unsigned int size = wq_info.scale().size();
+
+ auto &quant_multipliers = stage_info.gemmlowp_multipliers;
+ auto &quant_shifts = stage_info.gemmlowp_shifts;
+ quant_multipliers.resize(size);
+ quant_shifts.resize(size);
+
+ const auto &w_scales = wq_info.scale();
+ const float i_scale = iq_info.scale().at(0);
+ const float o_scale = oq_info.scale().at(0);
+
+ for(unsigned int i = 0; i < size; ++i)
+ {
+ const float multiplier = i_scale * w_scales[i] / o_scale;
+ int quant_multiplier = 0;
+ int quant_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift));
+ quant_multipliers[i] = quant_multiplier;
+ quant_shifts[i] = quant_shift;
+ }
+
+ // Legacy part
+ stage_info.gemmlowp_shift = quant_shifts[0];
+ stage_info.gemmlowp_multiplier = quant_multipliers[0];
+
+ return Status{};
+}
+
std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_type)
{
int min_quant_val = 0;
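calculate_quantized_multipliers_less_than_one above computes, per output channel, the real rescale factor i_scale * w_scales[i] / o_scale and turns it into an integer multiplier plus right shift through the existing calculate_quantized_multiplier_less_than_one, mirroring the first pair into the legacy scalar fields for code that still reads them. A sketch of the decomposition that helper is assumed to perform (the usual gemmlowp-style scheme, shown only to make the loop concrete; it is not the library routine itself):

#include <cmath>
#include <cstdint>

void decompose_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *right_shift)
{
    // Expect multiplier in [0, 1): write it as m * 2^exp with m in [0.5, 1)
    int    exp = 0;
    double m   = std::frexp(static_cast<double>(multiplier), &exp);
    *right_shift = -exp; // multiplier = (q / 2^31) * 2^(-right_shift)

    // Round the mantissa to a Q31 fixed-point value
    int64_t q = static_cast<int64_t>(std::round(m * (1ll << 31)));
    if(q == (1ll << 31)) // rounding pushed m up to 1.0
    {
        q /= 2;
        --(*right_shift);
    }
    *quant_multiplier = static_cast<int32_t>(q);
}

// For channel i the output stage then computes, roughly,
//   out = rounding_shift_right(acc * quant_multiplier, 31 + right_shift)
// which is what gemmlowp_multipliers[i] / gemmlowp_shifts[i] feed into.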
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index b31ecb91e..43e531579 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -450,13 +450,24 @@ Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::F16, DataType::F32);
+ if(is_data_type_quantized_per_channel(b->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8_SIGNED && d->data_type() != DataType::S32, "Only S32 output supported for QASYMM8_SIGNED input");
return Status{};
}
@@ -495,6 +506,7 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const
}
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
break;
#endif /* __aarch64__ */
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index f4377cdaf..caff117e0 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -59,7 +59,7 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I
Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
if(biases != nullptr)
@@ -114,18 +114,18 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
-
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ const QuantizationInfo iqinfo = input->info()->quantization_info();
+ const QuantizationInfo wqinfo = weights->info()->quantization_info();
+ const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+ input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
// Merge activation with output stage
int min_activation = 0;
@@ -133,26 +133,25 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
if(supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
// Revert back QuantizationInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, wqinfo.offset));
+ input->info()->set_quantization_info(iqinfo);
+ weights->info()->set_quantization_info(wqinfo);
}
else
{
@@ -176,20 +175,10 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+ const QuantizationInfo &iqinfo = input->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
// Merge activation with output stage
int min_activation = 0;
@@ -201,22 +190,25 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
};
if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
// Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = input->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
}
else
@@ -396,7 +388,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
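Editor's note: configure_mm()/validate_mm() fold the activation into the GEMMLowp output stage by quantizing the activation's float limits with the output UniformQuantizationInfo and storing them in gemmlowp_min_bound/gemmlowp_max_bound. The sketch below shows how such bounds can be derived; quantize_qasymm8_sketch and the numeric parameters are assumptions for illustration, not the library's quantize_qasymm8().

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of QASYMM8 affine quantization: q = clamp(round(x / scale) + offset, 0, 255)
static uint8_t quantize_qasymm8_sketch(float x, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(x / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main()
{
    // Hypothetical output quantization and a LU_BOUNDED_RELU(a=6, b=0) activation
    const float o_scale  = 0.05f;
    const int   o_offset = 10;

    const int max_bound = quantize_qasymm8_sketch(6.0f, o_scale, o_offset); // act_info.a()
    const int min_bound = quantize_qasymm8_sketch(0.0f, o_scale, o_offset); // act_info.b()

    // These values would populate gemmlowp_min_bound / gemmlowp_max_bound
    std::printf("min=%d max=%d\n", min_bound, max_bound);
    return 0;
}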
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 63f330be6..a478fdd23 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,12 @@
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 617d66cf2..01a99f7ac 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,9 +42,9 @@ using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr),
- _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
- _fuse_output_stage(false), _run_activation(false)
+ _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+ _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+ _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
{
}
@@ -56,6 +56,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
const ITensor *matrix_a = a;
const ITensor *matrix_b = b;
+ GEMMInfo info = gemm_info;
// Clear state
_mtx_a_reshape_kernel = nullptr;
@@ -65,13 +66,41 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_a_offset = a->info()->quantization_info().uniform().offset;
_b_offset = b->info()->quantization_info().uniform().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
_is_prepared = false;
_fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
_original_b = b;
+ const ITensor *a_to_use = a;
+
+ // Convert QASYMM8 -> QASYMM8_SIGNED and back
+ if(_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
+
+ _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
+ _memory_group.manage(&_signed_a);
+ _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.info()->quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+ _memory_group.manage(&_signed_output);
+ _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+ }
+
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_memory_group.manage(&_mm_result_s32);
@@ -83,17 +112,18 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
switch(a->info()->data_type())
{
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(a->info()->data_type() == DataType::QASYMM8 && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- _asm_glue.configure(a, b, c, output, gemm_info);
+ _asm_glue.configure(a_to_use, b, c, output, gemm_info);
_fused_assembly_path = _asm_glue.is_configured();
}
else
{
- _asm_glue.configure(a, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
}
_assembly_path = _asm_glue.is_configured();
break;
@@ -111,7 +141,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+ TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
_tmp_a.allocator()->init(a_info);
@@ -125,7 +155,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure interleave kernel
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
+ k->configure(a_to_use, &_tmp_a);
_mtx_a_reshape_kernel = std::move(k);
}
@@ -151,19 +181,19 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+ _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
}
if(_fuse_output_stage)
@@ -176,8 +206,17 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_kernel = std::move(k);
}
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
- _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if(_flip_signedness)
+ {
+ _convert_from_signed_asymm.configure(&_signed_output, output);
+ }
}
else
{
@@ -189,7 +228,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_kernel = std::move(k);
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
}
}
@@ -228,22 +267,31 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
_mm_result_s32.allocator()->allocate();
}
+
+ if(_flip_signedness)
+ {
+ _signed_a.allocator()->allocate();
+ _signed_output.allocator()->allocate();
+ }
}
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ GEMMInfo info = gemm_info;
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
+ const ITensorInfo *a_to_use = a;
+
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
TensorInfo mm_result_s32_info{};
@@ -251,31 +299,57 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
int32_t a_offset = a->quantization_info().uniform().offset;
int32_t b_offset = b->quantization_info().uniform().offset;
- bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
if(fuse_output_stage)
{
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if(flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+ }
+
// Check if we need to run the optimized assembly kernel
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a->data_type()))
+ if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
}
if(run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if(info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if(info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -292,8 +366,8 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
if(!run_vector_matrix_multiplication)
@@ -312,10 +386,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
}
}
@@ -340,7 +414,7 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
}
if(fuse_output_stage)
@@ -354,8 +428,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
b_offset == 0 ? nullptr : &info_vector_sum_row,
- c, output, a_offset, b_offset,
- gemm_info.gemmlowp_output_stage()));
+ c,
+ flip_signedness ? &signed_output : output,
+ a_offset, b_offset,
+ info.gemmlowp_output_stage()));
}
else
{
@@ -397,6 +473,12 @@ void NEGEMMLowpMatrixMultiplyCore::run()
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+ }
+
// Run GEMM
if(_asm_glue.is_configured())
{
@@ -433,6 +515,12 @@ void NEGEMMLowpMatrixMultiplyCore::run()
}
}
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+ }
+
// Run fused activation
if(_run_activation)
{
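Editor's note: the new _flip_signedness path converts a QASYMM8 input to QASYMM8_SIGNED (and the output back) using an offset correction of 128, so the assembly path can consume signed data while the represented real values stay unchanged; the output-stage offset and bounds are adjusted accordingly in the hunks above. A small self-contained sketch of the value conversion follows; flip_to_signed/flip_to_unsigned and the sample quantization parameters are assumptions, not the kernel's implementation.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Sketch: QASYMM8 <-> QASYMM8_SIGNED by an offset correction of 128.
// real_value = scale * (q - offset) is preserved because q and offset move by the same 128.
static int8_t flip_to_signed(uint8_t q)
{
    return static_cast<int8_t>(static_cast<int>(q) - 128);
}

static uint8_t flip_to_unsigned(int8_t q)
{
    return static_cast<uint8_t>(static_cast<int>(q) + 128);
}

int main()
{
    const float scale           = 0.1f;
    const int   offset_unsigned = 40;                     // hypothetical QASYMM8 offset
    const int   offset_signed   = offset_unsigned - 128;  // corrected offset for the signed view

    const uint8_t q_u = 55;
    const int8_t  q_s = flip_to_signed(q_u);

    const float real_u = scale * (static_cast<int>(q_u) - offset_unsigned);
    const float real_s = scale * (static_cast<int>(q_s) - offset_signed);
    assert(real_u == real_s);                 // same represented value
    assert(flip_to_unsigned(q_s) == q_u);     // lossless round trip

    std::printf("u=%d s=%d real=%f\n", q_u, q_s, real_u);
    return 0;
}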
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index f535f1630..280f6ddbd 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -640,6 +640,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
}
case DataType::S8:
case DataType::QSYMM8:
+ case DataType::QASYMM8_SIGNED:
{
std::uniform_int_distribution<int8_t> distribution_s8(std::numeric_limits<int8_t>::lowest(), std::numeric_limits<int8_t>::max());
fill(tensor, distribution_s8, seed_offset);
diff --git a/tests/Utils.h b/tests/Utils.h
index 0c4aeb61f..6b3935e52 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -359,6 +359,7 @@ void store_value_with_data_type(void *ptr, T value, DataType data_type)
*reinterpret_cast<uint8_t *>(ptr) = value;
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8:
case DataType::QSYMM8_PER_CHANNEL:
*reinterpret_cast<int8_t *>(ptr) = value;
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 415879329..95a554862 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -326,6 +326,24 @@ std::pair<int, int> get_quantized_bounds(const QuantizationInfo &quant_info, flo
return std::pair<int, int> { min_bound, max_bound };
}
+std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower or equal to max");
+
+ const int min_bound = quantize_qsymm8_per_channel(min, quant_info, channel_id);
+ const int max_bound = quantize_qsymm8_per_channel(max, quant_info, channel_id);
+ return std::pair<int, int> { min_bound, max_bound };
+}
+
+std::pair<int, int> get_asymm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(min > max, "min must be lower or equal to max");
+
+ const int min_bound = quantize_qasymm8_per_channel(min, quant_info, channel_id);
+ const int max_bound = quantize_qasymm8_per_channel(max, quant_info, channel_id);
+ return std::pair<int, int> { min_bound, max_bound };
+}
+
template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<int> &in, SimpleTensor<int> &roi, const Coordinates &coord);
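Editor's note: the helpers added above quantize a float range with a single channel's scale so the fixtures can pick per-channel fill bounds. Below is a self-contained sketch of symmetric per-channel quantization in that spirit; quantize_qsymm8_per_channel_sketch, per_channel_bounds and the clamp to the int8 range are illustrative assumptions rather than the library's quantize_qsymm8_per_channel().

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// Sketch of QSYMM8 per-channel quantization: q = clamp(round(x / scale[ch]), -128, 127)
static int quantize_qsymm8_per_channel_sketch(float x, const std::vector<float> &scales, size_t channel_id)
{
    const int q = static_cast<int>(std::lround(x / scales[channel_id]));
    return std::min(127, std::max(-128, q));
}

// Quantized bounds of [min, max] for one channel, mirroring the intent of
// get_symm_quantized_per_channel_bounds()
static std::pair<int, int> per_channel_bounds(const std::vector<float> &scales, float min, float max, size_t channel_id)
{
    return { quantize_qsymm8_per_channel_sketch(min, scales, channel_id),
             quantize_qsymm8_per_channel_sketch(max, scales, channel_id) };
}

int main()
{
    const std::vector<float> scales{ 0.02f, 0.5f, 0.007f }; // hypothetical per-channel scales
    for(size_t ch = 0; ch < scales.size(); ++ch)
    {
        const auto b = per_channel_bounds(scales, -1.0f, 1.0f, ch);
        std::printf("channel %zu: [%d, %d]\n", ch, b.first, b.second);
    }
    return 0;
}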
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 2ee2dc7aa..2c1df39f1 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -276,6 +276,24 @@ void zeros(SimpleTensor<T> &in, const Coordinates &anchor, const TensorShape &sh
* @param[in] max Floating point maximum value to be quantized
*/
std::pair<int, int> get_quantized_bounds(const QuantizationInfo &quant_info, float min, float max);
+
+/** Helper function to compute symmetric quantized min and max bounds
+ *
+ * @param[in] quant_info Quantization info to be used for conversion
+ * @param[in] min Floating point minimum value to be quantized
+ * @param[in] max Floating point maximum value to be quantized
+ * @param[in] channel_id Channel id for per channel quantization info.
+ */
+std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id = 0);
+
+/** Helper function to compute asymmetric quantized min and max bounds
+ *
+ * @param[in] quant_info Quantization info to be used for conversion
+ * @param[in] min Floating point minimum value to be quantized
+ * @param[in] max Floating point maximum value to be quantized
+ * @param[in] channel_id Channel id for per channel quantization info.
+ */
+std::pair<int, int> get_asymm_quantized_per_channel_bounds(const QuantizationInfo &quant_info, float min, float max, size_t channel_id = 0);
} // namespace validation
} // namespace test
} // namespace arm_compute
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index ceecd5805..df52d8065 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -74,6 +74,13 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f)
});
+
+const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+{
+ QuantizationInfo(0.5f, 10),
+ QuantizationInfo(0.3f, 3),
+ QuantizationInfo(1.f, 10),
+});
} // namespace
TEST_SUITE(NEON)
@@ -422,6 +429,9 @@ TEST_SUITE_END() // Float
template <typename T>
using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+template <typename T>
+using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T, int8_t>;
+
const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
{
ActivationLayerInfo(),
@@ -451,6 +461,33 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedFixture<uint8_t>
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+ combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(),
+ framework::dataset::make("ReshapeWeights", { true })),
+ framework::dataset::make("DataType", { DataType::QASYMM8 })),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ QuantizationData),
+ ActivationFunctionsDataset),
+ framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+ combine(combine(combine(combine(combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()),
+ framework::dataset::make("ReshapeWeights", { true })),
+ framework::dataset::make("DataType", { DataType::QASYMM8 })),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+ QuantizationData),
+ QuantizedActivationFunctionsDataset),
+ framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
TEST_SUITE_END() // Quantized
TEST_SUITE_END() // GEMMConvolutionLayer
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index 52fa8da60..c5cddc28d 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -48,7 +48,7 @@ namespace test
{
namespace validation
{
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
class ConvolutionValidationGenericFixture : public framework::Fixture
{
public:
@@ -57,13 +57,15 @@ public:
public:
template <typename...>
void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights,
- DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
+ DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info)
{
- _data_type = data_type;
- _is_quantized = is_data_type_quantized_asymmetric(data_type);
- _bias_data_type = _is_quantized ? DataType::S32 : data_type;
- _quantization_info = quantization_info;
- _data_layout = data_layout;
+ _data_type = data_type;
+ _weights_data_type = weights_data_type;
+ _is_quantized = is_data_type_quantized_asymmetric(data_type);
+ _bias_data_type = _is_quantized ? DataType::S32 : data_type;
+ _quantization_info = quantization_info;
+ _weight_quantization_info = weight_quantization_info;
+ _data_layout = data_layout;
_target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info);
_reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info);
@@ -82,6 +84,26 @@ protected:
library->fill(tensor, distribution, i);
break;
}
+ case DataType::QSYMM8_PER_CHANNEL:
+ {
+ int min_bound = 128;
+ int max_bound = -127;
+ for(size_t i = 0; i < _weight_quantization_info.scale().size(); i++)
+ {
+ std::pair<int, int> bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, i);
+ if(bounds.first < min_bound)
+ {
+ min_bound = bounds.first;
+ }
+ if(bounds.second > max_bound)
+ {
+ max_bound = bounds.second;
+ }
+ }
+ std::uniform_int_distribution<int8_t> distribution(min_bound, max_bound);
+ library->fill(tensor, distribution, i);
+ break;
+ }
case DataType::S32:
{
std::uniform_int_distribution<int32_t> distribution(-100, 100);
@@ -122,7 +144,7 @@ protected:
// Create tensors
TensorType src = create_tensor<TensorType>(input_shape, _data_type, 1, _quantization_info, _data_layout);
- TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _data_type, 1, _quantization_info, _data_layout);
+ TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, _weights_data_type, 1, _weight_quantization_info, _data_layout);
TensorType bias = create_tensor<TensorType>(bias_shape, _bias_data_type, 1, _quantization_info, _data_layout);
TensorType dst = create_tensor<TensorType>(output_shape, _data_type, 1, _quantization_info, _data_layout);
@@ -166,7 +188,7 @@ protected:
// Create reference
SimpleTensor<T> src{ input_shape, _data_type, 1, _quantization_info };
- SimpleTensor<T> weights{ weights_shape, _data_type, 1, _quantization_info };
+ SimpleTensor<TW> weights{ weights_shape, _weights_data_type, 1, _weight_quantization_info };
SimpleTensor<TBias> bias{ bias_shape, _bias_data_type, 1, _quantization_info };
// Fill reference
@@ -182,36 +204,59 @@ protected:
TensorType _target{};
SimpleTensor<T> _reference{};
DataType _data_type{};
+ DataType _weights_data_type{};
DataType _bias_data_type{};
DataLayout _data_layout{};
QuantizationInfo _quantization_info{};
+ QuantizationInfo _weight_quantization_info{};
bool _is_quantized = false;
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ConvolutionValidationFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+class ConvolutionValidationFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
{
public:
template <typename...>
void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
DataLayout data_layout, ActivationLayerInfo act_info)
{
- ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
- data_type, data_layout,
- QuantizationInfo(), act_info);
+ ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
+ data_type, data_type, data_layout,
+ QuantizationInfo(), QuantizationInfo(), act_info);
}
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class ConvolutionValidationQuantizedFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+class ConvolutionValidationQuantizedFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
{
public:
template <typename...>
void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
{
- ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
- data_type, data_layout, quantization_info, act_info);
+ ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights,
+ data_type, data_type, data_layout, quantization_info, quantization_info, act_info);
+ }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
+class ConvolutionValidationQuantizedPerChannelFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>
+{
+public:
+ template <typename...>
+ void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type,
+ DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info, DataType weights_data_type)
+ {
+ std::vector<float> weights_scales{};
+ std::mt19937 gen(library->seed());
+ std::uniform_real_distribution<> dis(0.01f, 1);
+ for(size_t i = 0; i < output_shape[2]; ++i)
+ {
+ weights_scales.push_back(dis(gen));
+ }
+ ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, TW>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation,
+ reshape_weights, data_type, weights_data_type, data_layout,
+ quantization_info, QuantizationInfo(weights_scales), act_info);
}
};
} // namespace validation
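Editor's note: ConvolutionValidationQuantizedPerChannelFixture draws one random scale per output channel, and the generic fixture then widens the fill range so it covers the per-channel bounds of [-1, 1] across all channels before drawing the weight values. The sketch below mimics that flow under the assumption of round-to-nearest symmetric quantization; the OFM count and the seed are made up.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::mt19937 gen(0);
    std::uniform_real_distribution<float> scale_dis(0.01f, 1.0f);

    const size_t num_output_channels = 8; // hypothetical OFM count (output_shape[2])
    std::vector<float> weights_scales(num_output_channels);
    for(auto &s : weights_scales)
    {
        s = scale_dis(gen); // one random scale per output channel
    }

    // Widen the fill range to cover every channel's quantized [-1, 1]
    int min_bound = 127;
    int max_bound = -128;
    for(size_t ch = 0; ch < weights_scales.size(); ++ch)
    {
        const int lo = std::max(-128, static_cast<int>(std::lround(-1.0f / weights_scales[ch])));
        const int hi = std::min(127, static_cast<int>(std::lround(1.0f / weights_scales[ch])));
        min_bound    = std::min(min_bound, lo);
        max_bound    = std::max(max_bound, hi);
    }

    // Values for a QSYMM8_PER_CHANNEL weight tensor would be drawn from this range
    std::uniform_int_distribution<int> dist(min_bound, max_bound);
    std::printf("fill range: [%d, %d], sample: %d\n", min_bound, max_bound, dist(gen));
    return 0;
}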
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 30be25f50..23918a405 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -42,13 +42,16 @@ inline bool is_valid_pixel(int i, int min, int max)
}
// 3D convolution for floating point type
-template < typename T, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TB>::value, int >::type = 0 >
-inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+template < typename T, typename TW, typename TB, typename std::enable_if < validation::is_floating_point<T>::value &&validation::is_floating_point<TW>::value
+ &&validation::is_floating_point<TB>::value,
+ int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
int i_offset, int w_offset, int b_offset, int o_offset,
- int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+ int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0)
{
+ ARM_COMPUTE_UNUSED(filter_id);
const T *in_ptr = in.data() + i_offset;
- const T *w_ptr = weights.data() + w_offset;
+ const TW *w_ptr = weights.data() + w_offset;
const TB *b_ptr = bias.data() + b_offset;
T *out_ptr = out.data() + o_offset;
@@ -77,8 +80,8 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weig
const int idx = xk + half_width_weights_start;
const int idy = yk + half_height_weights_start;
- const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
- const T w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
+ const T i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+ const TW w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
acc += i_value * w_value;
}
@@ -91,13 +94,16 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weig
}
// 3D convolution for QASYMM8 type
-template < typename T, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&std::is_same<TB, int32_t>::value, int >::type = 0 >
-inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
+template < typename T, typename TW, typename TB, typename std::enable_if < std::is_same<T, uint8_t>::value &&(std::is_same<TW, uint8_t>::value
+ || std::is_same<TW, int8_t>::value)
+ &&std::is_same<TB, int32_t>::value,
+ int >::type = 0 >
+inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &out,
int i_offset, int w_offset, int b_offset, int o_offset,
- int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1)
+ int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int dilation_x = 1, int dilation_y = 1, int filter_id = 0)
{
const T *in_ptr = in.data() + i_offset;
- const T *w_ptr = weights.data() + w_offset;
+ const TW *w_ptr = weights.data() + w_offset;
const TB *b_ptr = bias.data() + b_offset;
T *out_ptr = out.data() + o_offset;
@@ -107,10 +113,22 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weig
const int input_offset = -iq_info.offset;
const float input_scale = iq_info.scale;
- const int weights_offset = -wq_info.offset;
- const float weights_scale = wq_info.scale;
- const int output_offset = oq_info.offset;
- const float output_scale = oq_info.scale;
+ int weights_offset = -wq_info.offset;
+ float weights_scale = wq_info.scale;
+ if(is_data_type_quantized_per_channel(weights.data_type()))
+ {
+ if(is_data_type_quantized_asymmetric(weights.data_type()))
+ {
+ weights_offset = weights.quantization_info().offset()[filter_id];
+ }
+ else
+ {
+ weights_offset = 0;
+ }
+ weights_scale = weights.quantization_info().scale()[filter_id];
+ }
+ const int output_offset = oq_info.offset;
+ const float output_scale = oq_info.scale;
int output_multiplier = 0;
int output_shift = 0;
@@ -142,9 +160,8 @@ inline void convolution3d(const SimpleTensor<T> &in, const SimpleTensor<T> &weig
const int idx = xk + half_width_weights_start;
const int idy = yk + half_height_weights_start;
- const uint8_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
- const uint8_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
-
+ const int32_t i_value = in_ptr[offset_slice_in + xk * dilation_x + yk * dilation_y * width_in];
+ const int32_t w_value = w_ptr[idx + idy * width_weights + ifm * width_weights * height_weights];
acc += (i_value + input_offset) * (w_value + weights_offset);
}
}
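Editor's note: in the QASYMM8 reference path the int32 accumulator collects (i_value + input_offset) * (w_value + weights_offset) per output element, with weights_offset forced to 0 and weights_scale taken per filter for QSYMM8_PER_CHANNEL, and is then scaled back to the output domain with multiplier = input_scale * weights_scale / output_scale. The sketch below performs that last requantization step with a float multiplier for clarity; the reference code instead uses the fixed-point output_multiplier/output_shift pair, and requantize_sketch plus the numeric parameters are assumptions.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of the final requantization for one output element.
static uint8_t requantize_sketch(int32_t acc, float input_scale, float weights_scale,
                                 float output_scale, int output_offset)
{
    const float multiplier = input_scale * weights_scale / output_scale; // per-filter for QSYMM8_PER_CHANNEL
    const int   q          = static_cast<int>(std::lround(acc * multiplier)) + output_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main()
{
    // Hypothetical quantization parameters; weights_scale would come from scale()[filter_id]
    const float input_scale   = 0.5f;
    const float weights_scale = 0.02f;
    const float output_scale  = 0.1f;
    const int   output_offset = 10;

    const int32_t acc = 250; // accumulated int32 value plus bias for this output element
    std::printf("q_out = %u\n", requantize_sketch(acc, input_scale, weights_scale, output_scale, output_offset));
    return 0;
}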
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 69090117f..4d2c1acb6 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -45,8 +45,8 @@ namespace
{
} // namespace
-template <typename T, typename TB>
-SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, SimpleTensor<T> &dst, const PadStrideInfo &info,
const Size2D &dilation, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON((src.shape()[2] / num_groups) != weights.shape()[2]);
@@ -73,7 +73,6 @@ SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleT
const int end_xi = output_wh.first * stride_xi;
const int end_yi = output_wh.second * stride_yi;
const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
-
for(int r = 0; r < num_batches; ++r)
{
for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi)
@@ -100,17 +99,16 @@ SimpleTensor<T> convolution_layer_nchw(const SimpleTensor<T> &src, const SimpleT
offset_in, offset_w, offset_b, offset_out,
xi, yi,
width_in, height_in, (depth_in / num_groups),
- width_weights, height_weights, dilation.x(), dilation.y());
+ width_weights, height_weights, dilation.x(), dilation.y(), ofm);
}
}
}
}
}
-
return dst;
}
-template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info)
{
// if no explicit quantization has been set use the same as src
@@ -123,9 +121,9 @@ SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor
if(src.data_layout() == DataLayout::NHWC)
{
- SimpleTensor<T> src_nchw = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
- SimpleTensor<T> weights_nchw = reference::permute<T>(weights, PermutationVector(1U, 2U, 0U));
- SimpleTensor<T> dst_nchw = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
+ SimpleTensor<T> src_nchw = reference::permute<T>(src, PermutationVector(1U, 2U, 0U));
+ SimpleTensor<TW> weights_nchw = reference::permute<TW>(weights, PermutationVector(1U, 2U, 0U));
+ SimpleTensor<T> dst_nchw = reference::permute<T>(dst, PermutationVector(1U, 2U, 0U));
return reference::permute<T>(convolution_layer_nchw(src_nchw, weights_nchw, bias, dst_nchw, info, dilation, num_groups), PermutationVector(2U, 0U, 1U));
}
@@ -141,6 +139,8 @@ template SimpleTensor<half> convolution_layer(const SimpleTensor<half> &src, con
const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<uint8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
+template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &src, const SimpleTensor<int8_t> &weights, const SimpleTensor<int32_t> &bias, const TensorShape &output_shape,
+ const PadStrideInfo &info, const Size2D &dilation, unsigned int num_groups, QuantizationInfo out_quant_info);
} // namespace reference
} // namespace validation
} // namespace test
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index c51a9b3ad..8f41073fe 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -35,8 +35,8 @@ namespace validation
{
namespace reference
{
-template <typename T, typename TB>
-SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<T> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
+template <typename T, typename TW, typename TB>
+SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor<TW> &weights, const SimpleTensor<TB> &bias, const TensorShape &output_shape, const PadStrideInfo &info,
const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, QuantizationInfo out_quant_info = QuantizationInfo());
} // namespace reference
} // namespace validation