4 files changed, 35 insertions, 21 deletions
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index f1398eb3cc..bdba42d6ba 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 
 #include "arm_compute/core/Types.h"
@@ -79,6 +80,7 @@ public:
 
     // Inherited methods overridden:
     void run() override;
+    void prepare() override;
 
 private:
     MemoryGroup                _memory_group;
@@ -86,11 +88,11 @@ private:
     NEArithmeticAdditionKernel _add_kernel;
     NEActivationLayerKernel    _activation_kernel;
     NEFullyConnectedLayer      _fully_connected_kernel;
+    NECopyKernel               _copy_kernel;
     Tensor                     _fully_connected_out;
     Tensor                     _gemm_output;
     Tensor                     _add_output;
-    ITensor                   *_hidden_state;
-    ITensor                   *_output;
+    bool                       _is_prepared;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NERNNLAYER_H__ */
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index f1e57c5983..995d5eed86 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -34,8 +34,8 @@
 namespace arm_compute
 {
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _fully_connected_out(), _gemm_output(), _add_output(), _hidden_state(),
-      _output()
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+      _is_prepared(false)
 {
 }
 
@@ -70,23 +70,25 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
     ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
 
-    _hidden_state = hidden_state;
-    _output       = output;
-
     const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
     TensorShape shape      = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
 
+    _is_prepared = false;
+
     // Manage intermediate buffers and configure
     _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+    // Register intermediate buffers with the memory group and configure the functions and kernels
     _memory_group.manage(&_fully_connected_out);
     _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
 
-    _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_gemm_output);
     _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
 
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
+
     _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
@@ -94,30 +96,37 @@ void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const I
 
     _activation_kernel.configure(&_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
+
+    _copy_kernel.configure(hidden_state, output);
 }
 
 void NERNNLayer::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     _fully_connected_kernel.run();
+
     _gemm_state_f.run();
+
     NEScheduler::get().schedule(&_add_kernel, Window::DimY);
     NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
 
     // copy hidden out to output
-    Window output_window;
-    output_window.use_tensor_dimensions(_output->info()->tensor_shape(), Window::DimY);
+    NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
 
-    Iterator hidden_state_it(_hidden_state, output_window);
-    Iterator output_it(_output, output_window);
+    _memory_group.release();
+}
 
-    execute_window_loop(output_window, [&](const Coordinates & id)
+void NERNNLayer::prepare()
+{
+    if(!_is_prepared)
     {
-        memcpy(output_it.ptr(), hidden_state_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size());
-    },
-    hidden_state_it, output_it);
+        _fully_connected_kernel.prepare();
+        _gemm_state_f.prepare();
 
-    _memory_group.release();
+        _is_prepared = true;
+    }
 }
 } // namespace arm_compute
diff --git a/tests/datasets/LSTMLayerDataset.h b/tests/datasets/LSTMLayerDataset.h
index a976caa0ba..c21f3208ce 100644
--- a/tests/datasets/LSTMLayerDataset.h
+++ b/tests/datasets/LSTMLayerDataset.h
@@ -160,9 +160,12 @@ class SmallLSTMLayerDataset final : public LSTMLayerDataset
 public:
     SmallLSTMLayerDataset()
     {
-        add_config(TensorShape(8U), TensorShape(8U, 16U), TensorShape(16U, 16U), TensorShape(16U), TensorShape(16U), TensorShape(16U), TensorShape(64U), ActivationLayerInfo(), 0.05f, 0.93f);
-        add_config(TensorShape(8U, 2U), TensorShape(8U, 16U), TensorShape(16U, 16U), TensorShape(16U), TensorShape(16U, 2U), TensorShape(16U, 2U), TensorShape(64U, 2U), ActivationLayerInfo(), 0.05f, 0.93f);
-        add_config(TensorShape(8U, 2U), TensorShape(8U, 16U), TensorShape(16U, 16U), TensorShape(16U), TensorShape(16U, 2U), TensorShape(16U, 2U), TensorShape(48U, 2U), ActivationLayerInfo(), 0.05f, 0.93f);
+        add_config(TensorShape(8U), TensorShape(8U, 16U), TensorShape(16U, 16U), TensorShape(16U), TensorShape(16U), TensorShape(16U), TensorShape(64U),
+                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), 0.05f, 0.93f);
+        add_config(TensorShape(8U, 2U), TensorShape(8U, 16U), TensorShape(16U, 16U), TensorShape(16U), TensorShape(16U, 2U), TensorShape(16U, 2U), TensorShape(64U, 2U),
+                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), 0.05f, 0.93f);
+        add_config(TensorShape(8U, 2U), TensorShape(8U, 16U), TensorShape(16U, 16U), TensorShape(16U), TensorShape(16U, 2U), TensorShape(16U, 2U), TensorShape(48U, 2U),
+                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), 0.05f, 0.93f);
     }
 };
 
diff --git a/tests/datasets/RNNLayerDataset.h b/tests/datasets/RNNLayerDataset.h
index 40d1b934f3..5f42def676 100644
--- a/tests/datasets/RNNLayerDataset.h
+++ b/tests/datasets/RNNLayerDataset.h
@@ -131,7 +131,7 @@ class SmallRNNLayerDataset final : public RNNLayerDataset
 public:
     SmallRNNLayerDataset()
     {
-        add_config(TensorShape(128U, 16U), TensorShape(128U, 32U), TensorShape(32U, 32U), TensorShape(32U), TensorShape(32U, 16U), ActivationLayerInfo());
+        add_config(TensorShape(128U, 16U), TensorShape(128U, 32U), TensorShape(32U, 32U), TensorShape(32U), TensorShape(32U, 16U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
     }
 };