From a387e271b1e02ffd5c2993702b9a21c1ed5c95fa Mon Sep 17 00:00:00 2001
From: Sheri Zhang <sheri.zhang@arm.com>
Date: Tue, 29 Jun 2021 17:34:06 +0100
Subject: Add in-place calculation support for CL elementwise arithmetic
 kernels

- Add in-place calculation support in ClArithmeticKernel,  ClSaturatedArithmeticKernel and ClMulKernel
- Add in-place test cases

Resolves: COMPMID-4431

Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: Id484bdb76b74478a33fedb471ae0c7f799c599f6
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5885
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
---
 .../fixtures/PixelWiseMultiplicationFixture.h      | 44 ++++++++++++++--------
 1 file changed, 28 insertions(+), 16 deletions(-)

(limited to 'tests/validation/fixtures/PixelWiseMultiplicationFixture.h')
diff --git a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
index 7c643bd726..8dc5179109 100644
--- a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
+++ b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
@@ -76,9 +76,29 @@ protected:
                               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info)
     {
         // Create tensors
-        TensorType src1 = create_tensor<TensorType>(shape0, dt_in1, 1, qinfo0);
-        TensorType src2 = create_tensor<TensorType>(shape1, dt_in2, 1, qinfo1);
-        TensorType dst  = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), dt_out, 1, qinfo_out);
+        const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
+        TensorType        src1      = create_tensor<TensorType>(shape0, dt_in1, 1, qinfo0);
+        TensorType        src2      = create_tensor<TensorType>(shape1, dt_in2, 1, qinfo1);
+        TensorType        dst       = create_tensor<TensorType>(out_shape, dt_out, 1, qinfo_out);
+
+        // Check whether do in-place computation and whether inputs are broadcast compatible
+        TensorType *actual_dst = &dst;
+        if(_is_inplace)
+        {
+            bool src1_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out) && (dt_in1 == dt_out);
+            bool src2_is_inplace = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out) && (dt_in2 == dt_out);
+            bool do_in_place     = out_shape.total_size() != 0 && (src1_is_inplace || src2_is_inplace);
+            ARM_COMPUTE_ASSERT(do_in_place);
+
+            if(src1_is_inplace)
+            {
+                actual_dst = &src1;
+            }
+            else
+            {
+                actual_dst = &src2;
+            }
+        }
 
         auto allocate_tensor = [](TensorType & t)
         {
@@ -89,11 +109,12 @@ protected:
 
         // Create and configure function
         FunctionType multiply;
-        multiply.configure(&src1, &src2, (_is_inplace ? &src1 : &dst), scale, convert_policy, rounding_policy, act_info);
+        multiply.configure(&src1, &src2, actual_dst, scale, convert_policy, rounding_policy, act_info);
 
         allocate_tensor(src1);
         allocate_tensor(src2);
 
+        // If don't do in-place computation, still need to allocate original dst
         if(!_is_inplace)
         {
             allocate_tensor(dst);
@@ -106,12 +127,7 @@ protected:
         // Compute function
         multiply.run();
 
-        if(_is_inplace)
-        {
-            return src1;
-        }
-
-        return dst;
+        return std::move(*actual_dst);
     }
 
     SimpleTensor<T3> compute_reference(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out,
@@ -122,16 +138,12 @@ protected:
         SimpleTensor<T1> src1{ shape0, dt_in1, 1, qinfo0 };
         SimpleTensor<T2> src2{ shape1, dt_in2, 1, qinfo1 };
 
-        // current in-place implementation only supports same metadata of input and output tensors.
-        // By ignoring output quantization information here, we can make test cases implementation much simpler.
-        QuantizationInfo output_qinfo = _is_inplace ? qinfo0 : qinfo_out;
-
         // Fill reference
         fill(src1, 0);
         fill(src2, 1);
 
-        auto result = reference::pixel_wise_multiplication<T1, T2, T3>(src1, src2, scale, convert_policy, rounding_policy, dt_out, output_qinfo);
-        return act_info.enabled() ? reference::activation_layer(result, act_info, output_qinfo) : result;
+        auto result = reference::pixel_wise_multiplication<T1, T2, T3>(src1, src2, scale, convert_policy, rounding_policy, dt_out, qinfo_out);
+        return act_info.enabled() ? reference::activation_layer(result, act_info, qinfo_out) : result;
     }
 
     TensorType       _target{};
-- 
cgit v1.2.1