From ad7515d231acb075a9585e52f257373b1a1b5d1f Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Fri, 24 Jul 2020 00:02:23 +0100
Subject: COMPMID-3385: Async support to CLArithmetic* kernels/functions Pt.1

Signed-off-by: Michalis Spyrou
Change-Id: I94007565e688f8a0aead4f14c9fc30bfd9f9f7eb
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3613
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
---
 src/runtime/CL/functions/CLLSTMLayer.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'src/runtime/CL/functions/CLLSTMLayer.cpp')

diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 1b46baaf5e..a1c412401b 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -155,7 +155,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
                                                    RoundingPolicy::TO_NEAREST_EVEN);
         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         forget_gate_out->allocator()->allocate();
-        _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
         _forget_layer_norm_out1.allocator()->allocate();
         forget_gate_out = &_forget_layer_norm_out2;
     }
@@ -173,7 +173,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
-        _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -222,7 +222,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
                                                       RoundingPolicy::TO_NEAREST_EVEN);
             // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
             input_gate_out->allocator()->allocate();
-            _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+            _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
             _input_layer_norm_out1.allocator()->allocate();
             input_gate_out = &_input_layer_norm_out2;
         }
@@ -246,7 +246,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
     _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     CLTensor *cell_state_out_ptr = &_cell_state_out4;
     if(_is_layer_norm_lstm)
     {
@@ -259,7 +259,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
                                                  RoundingPolicy::TO_NEAREST_EVEN);
         // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
         cell_state_out_ptr->allocator()->allocate();
-        _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
         _cell_layer_norm_out1.allocator()->allocate();
         cell_state_out_ptr = &_cell_layer_norm_out2;
     }
@@ -268,7 +268,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
     _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     cell_state_out_ptr->allocator()->allocate();
     _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
@@ -329,7 +329,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
                                                    RoundingPolicy::TO_NEAREST_EVEN);
         // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         output_gate_out->allocator()->allocate();
-        _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
         _output_layer_norm_out1.allocator()->allocate();
         output_gate_out = &_output_layer_norm_out2;
     }
@@ -538,7 +538,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }

     // Validate cell state
@@ -636,14 +636,14 @@ void CLLSTMLayer::run()
     {
         _mean_std_norm_forget_gate.run();
         CLScheduler::get().enqueue(_pixelwise_mul_forget_gate_coeff);
-        CLScheduler::get().enqueue(_accum_forget_gate_bias);
+        _accum_forget_gate_bias.run();
     }
     _activation_forget_gate.run();

     if(_run_cifg_opt)
     {
         CLScheduler::get().enqueue(_ones_memset_kernel);
-        CLScheduler::get().enqueue(_subtract_input_gate);
+        _subtract_input_gate.run();
     }
     else
     {
@@ -659,7 +659,7 @@ void CLLSTMLayer::run()
         {
             _mean_std_norm_input_gate.run();
             CLScheduler::get().enqueue(_pixelwise_mul_input_gate_coeff);
-            CLScheduler::get().enqueue(_accum_input_gate_bias);
+            _accum_input_gate_bias.run();
         }
         _activation_input_gate.run();
     }
@@ -667,17 +667,17 @@ void CLLSTMLayer::run()
     _fully_connected_cell_state.run();
     CLScheduler::get().enqueue(_transpose_cell_state);
     _gemm_cell_state1.run();
-    CLScheduler::get().enqueue(_accum_cell_state1);
+    _accum_cell_state1.run();
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
         CLScheduler::get().enqueue(_pixelwise_mul_cell_gate_coeff);
-        CLScheduler::get().enqueue(_accum_cell_gate_bias);
+        _accum_cell_gate_bias.run();
     }
     _activation_cell_state.run();
     CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
     CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
-    CLScheduler::get().enqueue(_accum_cell_state2);
+    _accum_cell_state2.run();

     if(_perform_cell_clipping)
     {
@@ -695,7 +695,7 @@ void CLLSTMLayer::run()
     {
         _mean_std_norm_output_gate.run();
         CLScheduler::get().enqueue(_pixelwise_mul_output_gate_coeff);
-        CLScheduler::get().enqueue(_accum_output_gate_bias);
+        _accum_output_gate_bias.run();
     }
     _activation_output.run();
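
For context, the caller-side pattern this patch adopts looks roughly as follows. This is a minimal, hypothetical sketch of the post-patch function-level API (CLArithmeticAddition is shown; CLArithmeticSubtraction is analogous), assuming a ComputeLibrary build that contains this change; the tensor names, shapes and data types are illustrative and not part of the patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"

using namespace arm_compute;

int main()
{
    // Initialise the default OpenCL context/queue used by CLScheduler.
    CLScheduler::get().default_init();

    // Illustrative 1-D F32 tensors; sizes are arbitrary for this sketch.
    CLTensor a, b, sum;
    const TensorInfo info(TensorShape(16U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    sum.allocator()->init(info);

    // Function-level configure: no ArithmeticOperation enum argument any
    // more; the operation is fixed by the function's type (addition here).
    CLArithmeticAddition add;
    add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();

    // Execute through the function's own run() instead of enqueuing its
    // kernel via CLScheduler::get().enqueue(...), mirroring what
    // CLLSTMLayer::run() now does for the _accum_* and _subtract_* members.
    add.run();
    CLScheduler::get().sync(); // wait for the queued OpenCL work to finish

    return 0;
}

The two halves of the diff follow the same idea: configure() calls drop the ArithmeticOperation enum because each member is now a typed function (CLArithmeticAddition/CLArithmeticSubtraction) rather than a shared CLSaturatedArithmeticOperationKernel, and run()/validate() go through the function-level interface, which is what routes these operations onto the operator path this commit series is introducing.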