Make Cpu/Gpu/Ref scalar/vectoral S32 division consistent

- The Neon(TM) implementation converts integers to float and performs the division because there are no vector integer division instructions. However, the leftover loop still uses integer division, which makes results inconsistent depending on where we are in the tensor.
- The SVE path does it in the integer domain.
- OpenCL(TM) does it similarly to the Neon(TM) vector path.
- The reference implementation does it in the integer domain.

These differences cause intermittent mismatches. This patch ensures all follow the same logic. On the other hand, the provided Neon(TM) implementation is faster than the Fp32 converted version.

Resolves: COMPMID-6925
Change-Id: Ia12606d57f40a7d331b9b698f87fd4321496b275
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11316
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Gunes Bayir <gunes.bayir@arm.com> 2024-03-19 22:10:04 +0000
committer: Gunes Bayir <gunes.bayir@arm.com> 2024-03-20 12:21:39 +0000
commit: d2191150736dde66d79eb97e0c8ee506eef3c8fc (patch)
tree: 7b3dd39db3513bc11f4da9f47508b14445e5cf1f
parent: 1618e956130ddb0cb69acd56595d1a959e1db513 (diff)
download: ComputeLibrary-d2191150736dde66d79eb97e0c8ee506eef3c8fc.tar.gz
4 files changed, 18 insertions, 31 deletions
diff --git a/src/core/CL/cl_kernels/common/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl
index 45dcbfc6e2..91e51d9d1a 100644
--- a/src/core/CL/cl_kernels/common/elementwise_operation.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,11 +46,7 @@
 #define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))))
 #endif // VEC_SIZE_OUT == 1
 
-#if defined(S32)
-#define DIV(x, y) CONVERT(floor(CONVERT(x, VEC_DATA_TYPE(float, VEC_SIZE_OUT)) / CONVERT(y, VEC_DATA_TYPE(float, VEC_SIZE_OUT))), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
-#else /* S32 */
 #define DIV(x, y) (x / y)
-#endif /* S32 */
 
 #define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
 #define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 98f7e8b949..78e3baf74b 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
 
 #include "src/core/NEON/NEAsymm.h"
 
@@ -198,14 +198,6 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
         case ArithmeticOperation::DIV:
         {
             res = a / b;
-            if (std::is_integral<ScalarType>::value)
-            {
-                res = (b == 0) ? 0 : res;
-                if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
-                {
-                    --res;
-                }
-            }
             break;
         }
         case ArithmeticOperation::POWER:
@@ -224,7 +216,15 @@ inline int32x4_t
 elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,
                                                                                                    const int32x4_t &b)
 {
-    return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
+    int32x4_t result;
+
+    // Neon(TM) does not have vector integer division
+    result[0] = a[0] / b[0];
+    result[1] = a[1] / b[1];
+    result[2] = a[2] / b[2];
+    result[3] = a[3] / b[3];
+
+    return result;
 }
 
 template <>
@@ -1313,4 +1313,4 @@ void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */
+#endif // ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp
index 5f0224c91d..95db4ad5fd 100644
--- a/tests/validation/NEON/ElementwiseDivision.cpp
+++ b/tests/validation/NEON/ElementwiseDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@ namespace validation
 namespace
 {
 RelativeTolerance<float> tolerance_fp32(0.000001f);
-AbsoluteTolerance<int>   tolerance_zero_s32(1); // Tolerance for S32 division
+AbsoluteTolerance<int>   tolerance_zero_s32(0); // Tolerance for S32 division
 
 /** Input data sets **/
 const auto ElementwiseDivisionS32Dataset = combine(combine(framework::dataset::make("DataType", DataType::S32),
@@ -177,7 +177,7 @@ TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
 TEST_SUITE_END() // ElementwiseDivision
-TEST_SUITE_END() // Neon
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp
index f22c84e153..edbbab8600 100644
--- a/tests/validation/reference/ElementwiseOperations.cpp
+++ b/tests/validation/reference/ElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,15 +74,6 @@ T arithm_op(ArithmeticOperation op, T src1, T src2, ConvertPolicy convert_policy
         case ArithmeticOperation::DIV:
         {
             val = (static_cast<intermediate_type>(src1) / static_cast<intermediate_type>(src2));
-            if(std::is_integral<T>::value)
-            {
-                // Implement flooring division
-                val = (src2 == 0) ? 0 : val;
-                if(static_cast<int32_t>(src1) % static_cast<int32_t>(src2) != 0 && ((src1 < 0) != (src2 < 0)))
-                {
-                    --val;
-                }
-            }
             break;
         }
         case ArithmeticOperation::POWER:
author	Gunes Bayir <gunes.bayir@arm.com>	2024-03-19 22:10:04 +0000
committer	Gunes Bayir <gunes.bayir@arm.com>	2024-03-20 12:21:39 +0000
commit	d2191150736dde66d79eb97e0c8ee506eef3c8fc (patch)
tree	7b3dd39db3513bc11f4da9f47508b14445e5cf1f
parent	1618e956130ddb0cb69acd56595d1a959e1db513 (diff)
download	ComputeLibrary-d2191150736dde66d79eb97e0c8ee506eef3c8fc.tar.gz