15 files changed, 267 insertions, 111 deletions
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 26c2670cbc..b2bdb9a3e7 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,6 +49,7 @@
 #include "arm_compute/runtime/CL/functions/CLConvolution.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLCrop.h"
 #include "arm_compute/runtime/CL/functions/CLCropResize.h"
 #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h
index f1a091df84..795a183e1f 100644
--- a/arm_compute/runtime/CL/functions/CLCopy.h
+++ b/arm_compute/runtime/CL/functions/CLCopy.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_CLCOPY_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+#include <memory>
 
 namespace arm_compute
 {
@@ -35,32 +35,54 @@ class CLCompileContext;
 class ICLTensor;
 class ITensorInfo;
 
-class CLCopy : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClCopyKernel */
+class CLCopy : public IFunction
 {
 public:
+    /** Constructor */
+    CLCopy();
+    /** Destructor */
+    ~CLCopy();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCopy(const CLCopy &) = delete;
+    /** Default move constructor */
+    CLCopy(CLCopy &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCopy &operator=(const CLCopy &) = delete;
+    /** Default move assignment operator */
+    CLCopy &operator=(CLCopy &&);
     /** Initialise the function's source and destination.
      *
-     * @param[in]  input  Source tensor. Data types supported: All.
-     * @param[out] output Output tensor. Data types supported: Same as @p input.
-     *
+     * @param[in]  input      Source tensor. Data types supported: All.
+     * @param[out] output     Output tensor. Data types supported: Same as @p input.
+     * @param[in]  dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      */
-    void configure(ICLTensor *input, ICLTensor *output);
+    void configure(ICLTensor *input, ICLTensor *output, Window *dst_window = nullptr);
     /** Initialise the function's source and destination.
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: All.
      * @param[out] output          Output tensor. Data types supported: Same as @p input.
+     * @param[in]  dst_window      (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      *
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCopy
      *
-     * @param[in] input  Source tensor. Data types supported: All.
-     * @param[in] output Output tensor. Data types supported: Same as @p input.
+     * @param[in] input      Source tensor. Data types supported: All.
+     * @param[in] output     Output tensor. Data types supported: Same as @p input.
+     * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, Window *dst_window = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLCOPY_H */
diff --git a/arm_compute/runtime/CL/functions/CLCrop.h b/arm_compute/runtime/CL/functions/CLCrop.h
new file mode 100644
index 0000000000..dc509b5b84
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLCrop.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_H
+#define ARM_COMPUTE_CL_CROP_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run @ref opencl::kernels::ClCropKernel */
+class CLCrop : public IFunction
+{
+public:
+    /** Constructor */
+    CLCrop();
+    /** Destructor */
+    ~CLCrop();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCrop(const CLCrop &) = delete;
+    /** Default move constructor */
+    CLCrop(CLCrop &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLCrop &operator=(const CLCrop &) = delete;
+    /** Default move assignment operator */
+    CLCrop &operator=(CLCrop &&);
+    /** Configure function
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] output              Destination tensor. Data type supported: F32
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p input.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  output_window       (Optional) Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr);
+    /** Configure function
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] output              Destination tensor. Data type supported: F32
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p input.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  output_window       (Optional) Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+                   Window *output_window = nullptr);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLCrop
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input               Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+     * @param[in] output              Destination tensor info. Data type supported: F32
+     * @param[in] start               Coordinates of where to start cropping the image.
+     * @param[in] end                 Coordinates of where to end cropping the image.
+     * @param[in] batch_index         Fourth dimension index of the 3D image to crop in @p input.
+     * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in] output_window       (Optional) Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+                           Window *output_window = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_CROP_H */
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h
index e781cfe61f..0dc3c48b32 100644
--- a/arm_compute/runtime/CL/functions/CLCropResize.h
+++ b/arm_compute/runtime/CL/functions/CLCropResize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,9 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLCrop.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/CL/functions/CLScale.h"
 
 #include <cstdint>
@@ -36,8 +39,6 @@ namespace arm_compute
 {
 // Forward Declarations
 class CLCompileContext;
-class CLCopyKernel;
-class CLCropKernel;
 class ITensor;
 class ITensorInfo;
 
@@ -125,12 +126,12 @@ public:
     InterpolationPolicy _method;
     float               _extrapolation_value;
 
-    std::vector<std::unique_ptr<CLScale>>      _scale;
-    std::vector<std::unique_ptr<CLCopyKernel>> _copy;
-    std::vector<std::unique_ptr<CLTensor>>     _crop_results;
-    std::vector<std::unique_ptr<CLTensor>>     _scaled_results;
+    std::vector<std::unique_ptr<CLScale>>  _scale;
+    std::vector<std::unique_ptr<CLCopy>>   _copy;
+    std::vector<std::unique_ptr<CLTensor>> _crop_results;
+    std::vector<std::unique_ptr<CLTensor>> _scaled_results;
 
-    std::vector<std::unique_ptr<ICLKernel>> _internal_kernels;
+    std::vector<std::unique_ptr<IFunction>> _internal_functions;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_CROP_RESIZE_H */
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index 3ebc858d32..6c1302fbf7 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
@@ -35,13 +35,12 @@ namespace arm_compute
 // Forward declarations
 class CLDeconvolutionLayerUpsampleKernel;
 class CLCompileContext;
-class CLMemsetKernel;
 class ICLTensor;
 class ITensorInfo;
 
 /** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions:
  *
- * -# @ref CLMemsetKernel
+ * -# @ref CLFill
  * -# @ref CLDeconvolutionLayerUpsampleKernel
  */
 class CLDeconvolutionLayerUpsample : public IFunction
@@ -90,7 +89,7 @@ public:
 
 private:
     std::unique_ptr<CLDeconvolutionLayerUpsampleKernel> _upsample;
-    std::unique_ptr<CLMemsetKernel>                     _memset;
+    CLFill                                              _fill;
     ICLTensor                                          *_output;
 };
 } // namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h
index fef8324432..9a27d158a6 100644
--- a/arm_compute/runtime/CL/functions/CLFill.h
+++ b/arm_compute/runtime/CL/functions/CLFill.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,32 +24,63 @@
 #ifndef ARM_COMPUTE_CLFILL_H
 #define ARM_COMPUTE_CLFILL_H
 
-#include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+#include <memory>
 
 namespace arm_compute
 {
 class CLCompileContext;
 class ICLTensor;
 
-/** Function to run @ref CLMemsetKernel to fill a tensor with a scalar value */
-class CLFill : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClFillKernel */
+class CLFill : public IFunction
 {
 public:
-    /** Initialize the function
+    /** Constructor */
+    CLFill();
+    /** Destructor */
+    ~CLFill();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFill(const CLFill &) = delete;
+    /** Default move constructor */
+    CLFill(CLFill &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFill &operator=(const CLFill &) = delete;
+    /** Default move assignment operator */
+    CLFill &operator=(CLFill &&);
+    /** Initialize the function's tensor and filling value
      *
-     * @param[in,out] tensor         Source tensor. Data types supported: All.
-     * @param[in]     constant_value Constant value to use to fill tensor.
+     * @param[in,out] tensor         Input tensor to fill. Supported data types: All.
+     * @param[in]     constant_value The value used to fill the planes of the tensor
+     * @param[in]     window         (Optional) Window to be used in case only part of the tensor is to be set. Default is nullptr.
      */
-    void configure(ICLTensor *tensor, PixelValue constant_value);
-    /** Initialize the function
+    void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    /** Initialize the function's tensor and filling value
      *
      * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] tensor          Source tensor. Data types supported: All.
-     * @param[in]     constant_value  Constant value to use to fill tensor.
+     * @param[in,out] tensor          Input tensor to fill. Supported data types: All.
+     * @param[in]     constant_value  The value used to fill the planes of the tensor
+     * @param[in]     window          (Optional) Window to be used in case only part of the tensor is to be set. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value);
+    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFill
+     *
+     * @param[in] tensor         Source tensor info. Data types supported: All.
+     * @param[in] constant_value The value used to fill the planes of the tensor
+     * @param[in] window         (Optional) Window to be used in case only part of the tensor is to be set. Default is nullptr.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLFILL_H */
diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
index cf5fd500a0..4d6bc66487 100644
--- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
 #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
@@ -42,7 +43,6 @@ class CLBoundingBoxTransformKernel;
 class CLDequantizationLayerKernel;
 class CLComputeAllAnchorsKernel;
 class CLPadLayerKernel;
-class CLPermuteKernel;
 class CLQuantizationLayerKernel;
 class ICLTensor;
 class ITensorInfo;
@@ -137,9 +137,9 @@ private:
     MemoryGroup _memory_group;
 
     // OpenCL kernels
-    std::unique_ptr<CLPermuteKernel>              _permute_deltas_kernel;
+    CLPermute                                     _permute_deltas;
     CLReshapeLayer                                _flatten_deltas;
-    std::unique_ptr<CLPermuteKernel>              _permute_scores_kernel;
+    CLPermute                                     _permute_scores;
     CLReshapeLayer                                _flatten_scores;
     std::unique_ptr<CLComputeAllAnchorsKernel>    _compute_anchors_kernel;
     std::unique_ptr<CLBoundingBoxTransformKernel> _bounding_box_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index 017f26aa1e..20b068316c 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,9 @@
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
@@ -44,8 +46,6 @@
 namespace arm_compute
 {
 class CLCompileContext;
-class CLCopyKernel;
-class CLMemsetKernel;
 class CLTransposeKernel;
 class ICLTensor;
 
@@ -239,14 +239,14 @@ private:
     CLPixelWiseMultiplication          _pixelwise_mul_output_state2;
     CLFullyConnectedLayer              _fully_connected_output_state;
     CLActivationLayer                  _projection_clip;
-    std::unique_ptr<CLCopyKernel>      _copy_cell_state;
-    std::unique_ptr<CLCopyKernel>      _copy_output;
+    CLCopy                             _copy_cell_state;
+    CLCopy                             _copy_output;
     CLConcatenateLayer                 _concat_scratch_buffer;
     CLConcatenateLayer                 _concat_inputs_forget_gate;
     CLConcatenateLayer                 _concat_weights_forget_gate;
     CLConcatenateLayer                 _concat_weights_input_gate;
     CLConcatenateLayer                 _concat_weights_output;
-    std::unique_ptr<CLMemsetKernel>    _ones_memset_kernel;
+    CLFill                             _ones_fill;
     CLMeanStdDevNormalizationLayer     _mean_std_norm_input_gate;
     CLPixelWiseMultiplication          _pixelwise_mul_input_gate_coeff;
     CLArithmeticAddition               _accum_input_gate_bias;
diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
index 693862fb89..24d620d372 100644
--- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
@@ -35,12 +36,11 @@ class CLCompileContext;
 class ICLTensor;
 class ITensorInfo;
 class CLMaxUnpoolingLayerKernel;
-class CLMemsetKernel;
 struct PoolingLayerInfo;
 
 /** Function to perform MaxUnpooling. This function calls the following OpenCL kernels:
  *
- * -# @ref CLMemsetKernel
+ * -# @ref CLFill
  * -# @ref CLMaxUnpoolingLayerKernel
  */
 class CLMaxUnpoolingLayer : public IFunction
@@ -99,7 +99,7 @@ public:
     void run() override;
 
 private:
-    std::unique_ptr<CLMemsetKernel>            _memset_kernel;
+    CLFill                                     _fill;
     std::unique_ptr<CLMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index 2bbde30fc2..dae95f63e6 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,19 +26,20 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
 #include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 class CLCompileContext;
 class CLPadLayerKernel;
-class CLCopyKernel;
 class ICLTensor;
 
 /** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
  *
  *  -# @ref CLPadLayerKernel if there is padding to be added
- *  -# @ref CLCopyKernel otherwise
+ *  -# @ref CLCopy otherwise
  */
 class CLPadLayer : public IFunction
 {
@@ -100,7 +101,7 @@ private:
     void configure_reflect_mode(ICLTensor *input, ICLTensor *output);
 
     std::unique_ptr<CLPadLayerKernel> _pad_kernel;
-    std::unique_ptr<CLCopyKernel>     _copy_kernel;
+    CLCopy                            _copy;
     bool                              _perform_pad;
 };
 } // namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h
index 50e81da7c4..bcd9566fbf 100644
--- a/arm_compute/runtime/CL/functions/CLPermute.h
+++ b/arm_compute/runtime/CL/functions/CLPermute.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_CLPERMUTE_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
 
-#include <cstdint>
+#include <memory>
 
 namespace arm_compute
 {
@@ -35,10 +35,22 @@ class CLCompileContext;
 class ICLTensor;
 class ITensorInfo;
 
-/** Basic function to execute an @ref CLPermuteKernel. */
-class CLPermute : public ICLSimpleFunction
+/** Basic function to execute an @ref opencl::kernels::ClPermuteKernel. */
+class CLPermute : public IFunction
 {
 public:
+    /** Constructor */
+    CLPermute();
+    /** Destructor */
+    ~CLPermute();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPermute(const CLPermute &) = delete;
+    /** Default move constructor */
+    CLPermute(CLPermute &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPermute &operator=(const CLPermute &) = delete;
+    /** Default move assignment operator */
+    CLPermute &operator=(CLPermute &&);
     /** Set the input and output tensors.
      *
      * @note Arbitrary permutation vectors are supported with rank not greater than 4
@@ -69,6 +81,13 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLPERMUTE_H */
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index a8f9221b3d..954f224424 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
@@ -38,7 +39,6 @@ namespace arm_compute
 {
 // Forward declarations
 class CLCompileContext;
-class CLCopyKernel;
 class ICLTensor;
 class CLGEMMLowpMatrixAReductionKernel;
 class CLQLSTMLayerNormalizationKernel;
@@ -49,12 +49,12 @@ class ITensorInfo;
  * This function calls the following CL functions/kernels:
  *
  * -# @ref CLActivationLayer                                     Activation functions (tanh and logistic)
- * -# @ref CLCopyKernel                                          Copy kernel for copying output_state_out to output
- * -# @ref CLArithmeticAddition                  Elementwise addition and subtraction
+ * -# @ref CLCopy                                                Copy function for copying output_state_out to output
+ * -# @ref CLArithmeticAddition                                  Elementwise addition and subtraction
  * -# @ref CLGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
  * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
  * -# @ref CLGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
- * -# @ref CLPixelWiseMultiplication                       Elementwise multiplication
+ * -# @ref CLPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref CLTranspose                                           Transpose function for reshaping the weights
  * */
 class CLQLSTMLayer : public IFunction
@@ -354,7 +354,7 @@ private:
     CLArithmeticAddition                              _accumulate_projection{};
     CLActivationLayer                                 _projection_clip{};
     std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
-    std::unique_ptr<CLCopyKernel> _copy_output;
+    CLCopy _copy_output;
 
     TensorCopyKernel _projection_bias_copy{};
     TensorCopyKernel _projection_output_to_accumulate_copy{};
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index ff3fb5449b..50575daaa3 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
@@ -34,7 +35,6 @@
 
 namespace arm_compute
 {
-class CLCopyKernel;
 class ICLTensor;
 
 /** Basic function to run @ref CLRNNLayer */
@@ -93,16 +93,16 @@ public:
     void prepare() override;
 
 private:
-    MemoryGroup                   _memory_group;
-    CLGEMM                        _gemm_state_f;
-    CLArithmeticAddition          _add_kernel;
-    CLActivationLayer             _activation;
-    CLFullyConnectedLayer         _fully_connected_kernel;
-    std::unique_ptr<CLCopyKernel> _copy_kernel;
-    CLTensor                      _fully_connected_out;
-    CLTensor                      _gemm_output;
-    CLTensor                      _add_output;
-    bool                          _is_prepared;
+    MemoryGroup           _memory_group;
+    CLGEMM                _gemm_state_f;
+    CLArithmeticAddition  _add_kernel;
+    CLActivationLayer     _activation;
+    CLFullyConnectedLayer _fully_connected_kernel;
+    CLCopy                _copy;
+    CLTensor              _fully_connected_out;
+    CLTensor              _gemm_output;
+    CLTensor              _add_output;
+    bool                  _is_prepared;
 };
 }
 #endif /* ARM_COMPUTE_CLRNN_LAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index b4d52ec8cf..60ed81680e 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include <memory>
 
 namespace arm_compute
 {
@@ -33,7 +34,7 @@ class CLCompileContext;
 class ICLTensor;
 class ITensorInfo;
 
-/** Basic function to run @ref CLReshapeLayerKernel */
+/** Basic function to run @ref opencl::kernels::ClReshapeKernel */
 class CLReshapeLayer : public IFunction
 {
 public:
@@ -79,30 +80,5 @@ private:
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
-
-namespace experimental
-{
-/** Basic function to run @ref CLReshapeLayerKernel */
-class CLReshape : public ICLOperator
-{
-public:
-    /** Initialise the kernel's inputs and outputs
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor info. Data type supported: All
-     * @param[out] output          Output info. Data type supported: Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayer
-     *
-     * @param[in] input  Input tensor info. Data type supported: All
-     * @param[in] output Output tensor info. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace experimental
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLRESHAPELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index 5c5e5bed9a..dc02fa1363 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
 #include "arm_compute/runtime/IFunction.h"
 
 #include <memory>
@@ -33,14 +34,13 @@
 namespace arm_compute
 {
 class CLCompileContext;
-class CLMemsetKernel;
 class CLSpaceToBatchLayerKernel;
 class ICLTensor;
 class ITensorInfo;
 
 /** Basic function to spatial divide a tensor. This function calls the following OpenCL kernels/functions:
  *
- *  -# @ref CLMemsetKernel
+ *  -# @ref CLFill
  *  -# @ref CLSpaceToBatchLayerKernel
  */
 class CLSpaceToBatchLayer : public IFunction
@@ -125,7 +125,7 @@ public:
 
 private:
     std::unique_ptr<CLSpaceToBatchLayerKernel> _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
-    std::unique_ptr<CLMemsetKernel>            _memset_kernel;         /**< Memset kernel to run */
+    CLFill                                     _fill;                  /**< Fill function to run */
     bool                                       _has_padding;           /**< Flag to check if the output has padding */
 };
 } // namespace arm_compute