author      SiCong Li <sicong.li@arm.com>                2020-08-28 11:18:47 +0100
committer   Georgios Pinitas <georgios.pinitas@arm.com>  2020-09-09 12:01:59 +0000
commit      bb88f89b7a12e83eea2fc701f1f82aabf7dfcf7a (patch)
tree        dc9339328346fc539f45ee2b7b39a0786cadbc3a
parent      d64444ba197c2f95dcf4d205f50a196d5a29cdeb (diff)
download    ComputeLibrary-bb88f89b7a12e83eea2fc701f1f82aabf7dfcf7a.tar.gz
COMPMID-3581 Add S32 support to NEPixelWiseMultiplication
* Add S32 support to NEPixelWiseMultiplication and NEPixelWiseMultiplicationKernel
* Scale == 1/255 is not supported for S32, as on non-aarch64 the precision requirement is not met, and scale is a non-standard parameter anyway.
* Fix the data type validation logic to also test for all invalid data type combinations.
* Add validation tests for S32 NEON PixelWiseMultiplication
* The wrap tolerance for ScaleOther (scale == 1/2^n) cases is set to 1 instead of 0 because the reference uses floating point division followed by rounding, which isn't bit accurate.

Change-Id: I28839afda7a4f98c985d1763620e08d98f740142
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3923
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h  |  32
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h   |  46
-rw-r--r--  docs/00_introduction.dox                                         |   2
-rw-r--r--  src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp        | 168
-rw-r--r--  tests/validation/NEON/PixelWiseMultiplication.cpp                |  31
-rw-r--r--  tests/validation/reference/PixelWiseMultiplication.cpp           |  87
6 files changed, 308 insertions, 58 deletions
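For context, below is a minimal usage sketch of the new (S32,S32) -> S32 path through the runtime function. It is not part of this patch; the shapes, policies and values are illustrative and assume the standard Compute Library runtime headers and allocation flow.

// Illustrative only: exercises the S32 configuration added by this patch.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input1, input2, output;

    // The new configuration requires all three tensors to be S32
    input1.allocator()->init(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32));
    input2.allocator()->init(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32));

    NEPixelWiseMultiplication mul;
    // Scale must be 1/2^n for S32; 1/255 is rejected, as noted in the commit message
    mul.configure(&input1, &input2, &output, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    input1.allocator()->allocate();
    input2.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input1 and input2 with int32_t data ...
    mul.run();
    return 0;
}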
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index c530d78c42..c65f788091 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -55,27 +55,27 @@ public:
*
* Valid configurations (Input1,Input2) -> Output :
*
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
+ * - (U8,U8) -> U8, S16
* - (U8,S16) -> S16
* - (S16,U8) -> S16
* - (S16,S16) -> S16
+ * - (S32,S32) -> S32
* - (F16,F16) -> F16
* - (F32,F32) -> F32
* - (QASYMM8,QASYMM8) -> QASYMM8
* - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
+ * - (QSYMM16,QSYMM16) -> QSYMM16, S32
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of a quantized data type.
* @param[in] rounding_policy Rounding policy.
*/
void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
@@ -83,27 +83,27 @@ public:
*
* Valid configurations (Input1,Input2) -> Output :
*
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
+ * - (U8,U8) -> U8, S16
* - (U8,S16) -> S16
* - (S16,U8) -> S16
* - (S16,S16) -> S16
+ * - (S32,S32) -> S32
* - (F16,F16) -> F16
* - (F32,F32) -> F32
* - (QASYMM8,QASYMM8) -> QASYMM8
* - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
+ * - (QSYMM16,QSYMM16) -> QSYMM16, S32
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of a quantized data type.
* @param[in] rounding_policy Rounding policy.
*
* @return a status
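The scale constraint documented above (scale must be 1/255 or 1/2^n with n in [0, 15], with 1/255 additionally rejected when inputs and output are all S32) can be expressed as a small standalone check. The helper below is illustrative only; it is not how the library validates the value internally.

#include <cmath>

// Illustrative helper mirroring the documented constraint; not library code.
bool is_supported_scale(float scale, bool all_inputs_and_output_s32)
{
    // The 1/255 case maps to the round-to-nearest path and is rejected for S32
    if(std::abs(scale - 1.f / 255.f) < 0.00001f)
    {
        return !all_inputs_and_output_s32;
    }
    // Otherwise scale must be exactly 1/2^n for some n in [0, 15]
    for(int n = 0; n <= 15; ++n)
    {
        if(scale == 1.f / static_cast<float>(1 << n))
        {
            return true;
        }
    }
    return false;
}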
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 3c1aa5220c..4ff7f1d112 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -42,29 +42,29 @@ public:
*
* Valid configurations (Input1,Input2) -> Output :
*
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
+ * - (U8,U8) -> U8, S16
* - (U8,S16) -> S16
* - (S16,U8) -> S16
* - (S16,S16) -> S16
+ * - (S32,S32) -> S32
* - (F16,F16) -> F16
* - (F32,F32) -> F32
* - (QASYMM8,QASYMM8) -> QASYMM8
* - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
+ * - (QSYMM16,QSYMM16) -> QSYMM16, S32
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in, out] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in, out] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in, out] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of a quantized data type.
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
@@ -74,27 +74,27 @@ public:
*
* Valid configurations (Input1,Input2) -> Output :
*
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
+ * - (U8,U8) -> U8, S16
* - (U8,S16) -> S16
* - (S16,U8) -> S16
* - (S16,S16) -> S16
+ * - (S32,S32) -> S32
* - (F16,F16) -> F16
* - (F32,F32) -> F32
* - (QASYMM8,QASYMM8) -> QASYMM8
* - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- * - (QSYMM16,QSYMM16) -> S32
+ * - (QSYMM16,QSYMM16) -> QSYMM16, S32
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of a quantized data type.
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
@@ -150,9 +150,9 @@ public:
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
* This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported:
* - U8, only if both inputs are U8.
@@ -160,12 +160,13 @@ public:
* - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
* - S16.
* - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
* - F16, only if @p input1 is F16.
* - F32, only if both inputs are F32.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of a quantized data type.
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
@@ -176,20 +177,21 @@ public:
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
* @param[in] output Output tensor info. Data types supported:
* - U8, only if both inputs are U8.
* - QASYMM8, only if both inputs are QASYMM8.
* - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
* - S16.
* - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
* - F16, only if @p input1 is F16.
* - F32, only if both inputs are F32.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of a quantized data type.
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index f8f07906a8..bfe5799362 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -241,6 +241,8 @@ v20.11 Public major release
- Added new data type S32 support for:
- @ref NEArithmeticSubtraction
- @ref NEArithmeticSubtractionKernel
+ - @ref NEPixelWiseMultiplication
+ - @ref NEPixelWiseMultiplicationKernel
- Interface change
- Properly support softmax axis to have the same meaning as other major frameworks. That is, axis now defines the dimension
on which Softmax/Logsoftmax is performed. E.g. for input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5.
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 907a7f197b..302ee7694f 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -49,8 +49,10 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
+ DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
DataType::S16, DataType::QSYMM16,
DataType::S32, DataType::F16, DataType::F32);
@@ -65,23 +67,24 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
- "Output can only be U8 if both inputs are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
- "Output can only be QASYMM8 if both inputs are QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
- "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
- "Output can only be QSYMM16 if both inputs are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
- "Output can only be S32 if both inputs are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
+ // clang-format off
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(input1->data_type() == input2->data_type() && input2->data_type() == output->data_type()) &&
+ !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
+ !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16) &&
+ !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
+ !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
+ !(input1->data_type() == DataType::QSYMM16 && input2->data_type() == DataType::QSYMM16 && output->data_type() == DataType::S32)
+ , "Invalid data type combination");
+ // clang-format on
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QSYMM16 && output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
}
if(std::abs(scale - scale255_constant) < 0.00001f)
{
ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S32 && input2->data_type() == DataType::S32 && output->data_type() == DataType::S32,
+ "Scale == 1/255 is not supported if input and output are of data type S32");
}
else
{
@@ -710,6 +713,137 @@ void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
input1, input2, output);
}
+template <bool is_sat>
+inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &input1, const int32x4_t &input2, int n)
+{
+ const int32x2_t input1_1 = vget_low_s32(input1);
+ const int32x2_t input2_1 = vget_low_s32(input2);
+ const int32x2_t input1_2 = vget_high_s32(input1);
+ const int32x2_t input2_2 = vget_high_s32(input2);
+
+ int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
+ int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
+
+ // Apply scaling, conversion and rounding (round to zero)
+ // Right shift amount
+ const int64x2_t vn = vdupq_n_s64(-n);
+ // Left shift amount
+ const int64x2_t vnl = vdupq_n_s64(n);
+ // Calculate conversion bit
+ const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
+ const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
+ const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
+ const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
+
+ const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
+ const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
+ const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
+ const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
+ if(is_sat)
+ {
+ tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
+ tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
+ return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
+ }
+ else
+ {
+ tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
+ tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
+ return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
+ }
+}
+
+template <bool is_sat>
+inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &input1, const int32x4x2_t &input2, int n)
+{
+ const int32x4x2_t result =
+ {
+ {
+ // First 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(input1.val[0], input2.val[0], n),
+ // Second 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(input1.val[1], input2.val[1], n)
+ }
+ };
+
+ return result;
+}
+
+template <bool is_sat>
+void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+{
+ // Create input windows
+ Window win = window;
+ Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+ // Clear X Dimension on execution window as we handle manually
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input1(in1, input1_win);
+ Iterator input2(in2, input2_win);
+ Iterator output(out, win);
+
+ const int window_step_x = 8;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x2_t ta1 =
+ {
+ {
+ vld1q_s32(input1_ptr + x),
+ vld1q_s32(input1_ptr + x + 4),
+ }
+ };
+ const int32x4x2_t ta2 =
+ {
+ {
+ vld1q_s32(input2_ptr + x),
+ vld1q_s32(input2_ptr + x + 4),
+ }
+ };
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
+
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
+
+ if(tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint64_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ }
+ if(is_sat)
+ {
+ tmp = (tmp > INT_MAX) ? INT_MAX : ((tmp < INT_MIN) ? INT_MIN : tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
+ }
+ },
+ input1, input2, output);
+}
+
void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
// Create input windows
@@ -1200,6 +1334,12 @@ void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo
}
}
break;
+ case DataType::S32:
+ if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
+ {
+ _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
+ }
+ break;
case DataType::U8:
if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
{
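The scalar left-over loop in mul_S32_S32_S32 above applies the scale by shifting the exact 64-bit product, using the (tmp + mask) >> n trick so that negative values are truncated towards zero rather than towards minus infinity. A standalone sketch of that arithmetic (not part of the patch):

#include <cassert>
#include <cstdint>

// Divide a 64-bit product by 2^n, rounding towards zero, as the left-over loop does.
int64_t scale_by_pow2_towards_zero(int64_t product, int n)
{
    if(product >= 0)
    {
        return product >> n;
    }
    // Adding (2^n - 1) before the arithmetic shift turns floor division into truncation
    const int64_t mask = (static_cast<int64_t>(1) << n) - 1;
    return (product + mask) >> n;
}

int main()
{
    assert(scale_by_pow2_towards_zero(7, 2) == 1);   // 7/4  -> 1
    assert(scale_by_pow2_towards_zero(-7, 2) == -1); // -7/4 -> -1 (towards zero)
    assert((-7 >> 2) == -2);                         // a plain arithmetic shift rounds down instead
    return 0;
}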
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 0b88628912..a66f6f192f 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -111,6 +111,8 @@ using NEPixelWiseMultiplicationToU8Fixture = PixelWiseMultiplicationValidationFi
template <typename T>
using NEPixelWiseMultiplicationToS16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, int16_t>;
template <typename T>
+using NEPixelWiseMultiplicationToS32Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, int32_t>;
+template <typename T>
using NEPixelWiseMultiplicationToF16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, half_float::half>;
template <typename T>
using NEPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
@@ -139,6 +141,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), //11 Mismatching data type
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8), //12 Ok
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED), //13 Quantized cannot do WRAP
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), //14 S32 does not support scale255
}),
framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
@@ -153,6 +156,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
})),
framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
@@ -160,13 +164,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
})),
framework::dataset::make("Scale",{ scale_unity,
scale_unity,
@@ -180,7 +185,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
scale_unity,
scale_unity,
scale_unity,
- scale_unity})),
+ scale_unity,
+ scale_255})),
framework::dataset::make("OverflowPolicy",{
ConvertPolicy::WRAP,
ConvertPolicy::WRAP,
@@ -195,9 +201,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
ConvertPolicy::WRAP,
ConvertPolicy::SATURATE,
ConvertPolicy::WRAP,
+ ConvertPolicy::SATURATE,
})),
- framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false })),
+ framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false, false})),
input1_info, input2_info, output_info, scale, policy, expected)
{
bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, policy, RoundingPolicy::TO_ZERO));
@@ -260,7 +267,7 @@ TEST_SUITE_END() // InPlaceValidate
TEST_SUITE(Quantized)
TEST_SUITE(QASYMM8_SIGNED)
-TEST_SUITE(Scale255)
+TEST_SUITE(ScaleUnity)
FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
@@ -273,8 +280,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture,
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
-TEST_SUITE_END() // Scale255
-TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // ScaleUnity
+TEST_SUITE_END() // QASYMM8_SIGNED
TEST_SUITE(QASYMM8)
TEST_SUITE(Scale255)
@@ -476,6 +483,18 @@ TEST_SUITE_END() // ScaleOther
TEST_SUITE_END() // S16toS16
+TEST_SUITE(S32toS32)
+
+TEST_SUITE(ScaleUnity)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture<int32_t>, ALL, SmallShapes(), S32, S32, S32, scale_unity, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // ScaleUnity
+
+TEST_SUITE(ScaleOther)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS32Fixture<int32_t>, ALL, SmallShapes(), S32, S32, S32, scale_other, TO_ZERO, InPlaceDataSet, WRAP_VALIDATE(int32_t, 1))
+TEST_SUITE_END() // ScaleOther
+
+TEST_SUITE_END() // S32toS32
+
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(F16toF16)
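The wrap tolerance of 1 used for the S32 ScaleOther cases above follows from the reference implementation scaling with double-precision division and rounding, whereas the kernel shifts the exact 64-bit product; once the product exceeds the 53-bit mantissa of a double the two can differ by one. A small illustration (the input values are arbitrary, not taken from the test suite):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t a = 1073741825; // 2^30 + 1
    const int32_t b = 1073741827; // 2^30 + 3
    const int     n = 1;          // scale == 1/2

    const int64_t product = static_cast<int64_t>(a) * static_cast<int64_t>(b);

    // Kernel-style: exact integer shift of the 64-bit product
    const int64_t exact = product >> n;
    // Reference-style: double division followed by rounding (here: truncation)
    const int64_t via_double = static_cast<int64_t>(std::trunc(static_cast<double>(product) / (1 << n)));

    // The double path loses the low bits of the 61-bit product, so the two results differ by 1
    std::printf("exact=%lld via_double=%lld diff=%lld\n",
                static_cast<long long>(exact),
                static_cast<long long>(via_double),
                static_cast<long long>(exact - via_double));
    return 0;
}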
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 9f70b1c2af..0450991f61 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -43,6 +43,8 @@ struct is_floating_point
namespace
{
+constexpr float scale1_constant = 1.f;
+
/** Compute the result of `src1 * src2 * scale`. The result type always matches the type of @p src2.
*
* @param[in] src1 An input value. Data types supported: U8/S16/F16/F32.
@@ -89,6 +91,90 @@ T3 mul(const T1 src1, const T2 src2, float scale, ConvertPolicy convert_policy,
}
}
+template <>
+int32_t mul(const int32_t src1, const int32_t src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ const int64_t intermediate_val = static_cast<int64_t>(src1) * static_cast<int64_t>(src2);
+
+ if(std::abs(scale - scale1_constant) < 0.00001f)
+ {
+ // Use bit-accurate integer arithmetic for scale == 1
+ // Apply conversion
+ if(convert_policy == ConvertPolicy::SATURATE)
+ {
+ return saturate_cast<int32_t>(intermediate_val);
+ }
+ else
+ {
+ // Correct wrapping behaviour for int32_t
+ const auto i32_hi = static_cast<int64_t>(std::numeric_limits<int32_t>::max());
+ const auto i32_lo = static_cast<int64_t>(std::numeric_limits<int32_t>::lowest());
+ const auto i32_wi = static_cast<int64_t>(1) << 32;
+ int64_t wrapped_rounded_val = intermediate_val - i32_wi * static_cast<int64_t>(support::cpp11::trunc(static_cast<double>(intermediate_val) / i32_wi));
+ if(wrapped_rounded_val <= i32_hi)
+ {
+ return static_cast<int32_t>(wrapped_rounded_val);
+ }
+ else
+ {
+ // Values beyond i32_hi wrap around to negatives
+ return static_cast<int32_t>((wrapped_rounded_val - i32_hi) + i32_lo - 1);
+ }
+ }
+ }
+ else
+ {
+ // Use double arithmetic for scale != 1; may not be bit-accurate
+ // Apply scaling
+ // scale == 1 / 2^scale_exponent
+ int scale_exponent = 0;
+ std::frexp(scale, &scale_exponent);
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+ scale_exponent = std::abs(scale_exponent - 1);
+ const double scale_inv = static_cast<int64_t>(1) << scale_exponent;
+ const double val = intermediate_val / scale_inv;
+ // Apply rounding
+ double rounded_val = 0;
+ switch(rounding_policy)
+ {
+ case(RoundingPolicy::TO_ZERO):
+ rounded_val = support::cpp11::trunc(val);
+ break;
+ case(RoundingPolicy::TO_NEAREST_UP):
+ rounded_val = round_half_up(val);
+ break;
+ case(RoundingPolicy::TO_NEAREST_EVEN):
+ rounded_val = round_half_even(val);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported rounding policy");
+ }
+ // Apply conversion
+ if(convert_policy == ConvertPolicy::SATURATE)
+ {
+ return saturate_cast<int32_t>(rounded_val);
+ }
+ else
+ {
+ // Correct wrapping behaviour for int32_t
+ const auto i32_hi = static_cast<double>(std::numeric_limits<int32_t>::max());
+ const auto i32_lo = static_cast<double>(std::numeric_limits<int32_t>::lowest());
+ const auto i32_wi = static_cast<double>(static_cast<int64_t>(1) << 32);
+ double wrapped_rounded_val = rounded_val - i32_wi * std::floor(rounded_val / i32_wi);
+ if(wrapped_rounded_val <= i32_hi)
+ {
+ return static_cast<int32_t>(wrapped_rounded_val);
+ }
+ else
+ {
+ // Values beyond i32_hi wrap around to negatives
+ return static_cast<int32_t>((wrapped_rounded_val - i32_hi) + i32_lo - 1);
+ }
+ }
+ }
+}
+
template <size_t dim>
struct BroadcastUnroll
{
@@ -264,6 +350,7 @@ SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<int16_t> &src
// clang-format off
template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
template SimpleTensor<int32_t> pixel_wise_multiplication(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
+template SimpleTensor<int32_t> pixel_wise_multiplication(const SimpleTensor<int32_t> &src1, const SimpleTensor<int32_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
template SimpleTensor<float> pixel_wise_multiplication(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
template SimpleTensor<half_float::half> pixel_wise_multiplication(const SimpleTensor<half_float::half> &src1, const SimpleTensor<half_float::half> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
// clang-format on
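The WRAP branch of the new int32_t reference specialisation above reduces the result modulo 2^32 and maps the upper half of that range back to negative values; on the exact-integer path this is the same as two's-complement truncation to int32_t. A minimal sketch of that equivalence (not part of the patch, assumes a two's-complement target):

#include <cassert>
#include <cstdint>

// Wrap a 64-bit value into int32_t range, modulo 2^32.
int32_t wrap_to_int32(int64_t value)
{
    // Truncating through uint32_t performs the modulo-2^32 reduction;
    // the final cast maps values above INT32_MAX into the negative range.
    return static_cast<int32_t>(static_cast<uint32_t>(static_cast<uint64_t>(value)));
}

int main()
{
    const int64_t overflowing = static_cast<int64_t>(INT32_MAX) * 2; // 4294967294
    assert(wrap_to_int32(overflowing) == -2);  // wraps past INT32_MAX into the negatives
    assert(wrap_to_int32(-1) == -1);           // in-range values are unchanged
    return 0;
}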