3 files changed, 16 insertions, 7 deletions
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index d2f8a78f87..3751178703 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -34,9 +35,14 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ICLTensor;
 
-/** Basic function to run @ref CLDeconvolutionLayerUpsampleKernel */
+/** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLMemsetKernel
+ * -# @ref CLDeconvolutionLayerUpsampleKernel
+ */
 class CLDeconvolutionLayerUpsample : public IFunction
 {
 public:
@@ -79,7 +85,8 @@ public:
 
 private:
     CLDeconvolutionLayerUpsampleKernel _upsample;
+    CLMemsetKernel                     _memset;
     ICLTensor                         *_output;
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index 936263d635..b9a435abb2 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -26,10 +26,9 @@
 
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
 #include "arm_compute/runtime/CL/functions/CLTranspose.h"
 
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
-
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -70,7 +69,7 @@ class ICLTensor;
  * -# @ref CLConvolutionLayer
  *
- * And the following CPP kernels:
- * -# @ref CPPFlipWeightsKernel
+ * And the following OpenCL functions:
+ * -# @ref CLReverse
  *
  */
 class CLDirectDeconvolutionLayer : public IFunction
@@ -119,11 +118,12 @@ private:
     CLMemoryGroup                _memory_group;
     CLDeconvolutionLayerUpsample _scale_f;
     CLConvolutionLayer           _conv_f;
-    CPPFlipWeightsKernel         _flip_weights;
+    CLReverse                    _flip_weights;
 
     CLTensor   _scaled_output;
     ICLTensor *_original_weights;
     CLTensor   _weights_flipped;
+    CLTensor   _flip_axis;
 
     bool _is_prepared;
 };
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index a804a4af5b..8bd47cbf8e 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
 #include "arm_compute/core/Types.h"
@@ -188,6 +189,7 @@ private:
     CLWidthConcatenate2TensorsKernel     _concat_weights_forget_gate;
     CLWidthConcatenate2TensorsKernel     _concat_weights_input_gate;
     CLWidthConcatenate2TensorsKernel     _concat_weights_output;
+    CLMemsetKernel                       _ones_memset_kernel;
     CLTensor                             _input_gate_out1;
     CLTensor                             _input_gate_out2;
     CLTensor                             _input_gate_out3;