From f20d6d6ae5a0da2c856294e93341cdc065db58f9 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Thu, 16 Jul 2020 17:46:51 +0100
Subject: COMPMID-3390: Async support to CLStridedSliceLayerKernel
 kernels/functions

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: I9ff7e8d2fb4d36c4b7c44e885abf34ff6d4c577c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3587
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/CL/kernels/CLStridedSliceKernel.h | 45 ++-----------
 arm_compute/runtime/CL/functions/CLSlice.h         | 66 ++++++++++++++++++-
 arm_compute/runtime/CL/functions/CLStridedSlice.h  | 76 +++++++++++++++++++++-
 src/core/CL/kernels/CLStridedSliceKernel.cpp       | 49 ++++++--------
 src/runtime/CL/functions/CLSlice.cpp               | 53 +++++++++++++--
 src/runtime/CL/functions/CLStridedSlice.cpp        | 63 ++++++++++++++++--
 6 files changed, 269 insertions(+), 83 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
index ff3b0697a5..28a665b113 100644
--- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+++ b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
@@ -31,49 +31,17 @@
 
 namespace arm_compute
 {
-// Forward declarations
-class ICLTensor;
-
 /** Interface for the kernel to perform tensor strided slicing */
 class CLStridedSliceKernel : public ICLKernel
 {
 public:
-    /** Default constructor */
-    CLStridedSliceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
-    /** Default destructor */
-    ~CLStridedSliceKernel() = default;
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  input            Source tensor. Data type supported: All.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
-     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
      *
      * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data type supported: All.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input            Source tensor info. Data type supported: All.
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
      * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -82,7 +50,7 @@ public:
      * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
                    const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                    int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 
@@ -105,11 +73,8 @@ public:
                            int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;  /**< Source tensor */
-    ICLTensor       *_output; /**< Destination tensor */
+    void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs,
+                const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H */
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h
index 9f9591e4de..6fe62acaf5 100644
--- a/arm_compute/runtime/CL/functions/CLSlice.h
+++ b/arm_compute/runtime/CL/functions/CLSlice.h
@@ -24,17 +24,72 @@
 #ifndef ARM_COMPUTE_CL_SLICE_H
 #define ARM_COMPUTE_CL_SLICE_H
 
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 // Forward Declarations
 class ICLTensor;
 
+namespace experimental
+{
 /** Basic function to perform tensor slicing */
-class CLSlice : public ICLSimpleFunction
+class CLSlice : public ICLOperator
 {
 public:
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Start indices must be non-negative. 0 <= starts[i]
+     * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+     * @note End indices are not inclusive unless negative.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor info. Data type supported: All.
+     * @param[out] output          Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts          The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends            The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSlice
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Start indices must be non-negative. 0 <= starts[i]
+     * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+     * @note End indices are not inclusive unless negative.
+     *
+     * @param[in] input  Source tensor info. Data type supported: All
+     * @param[in] output Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends   The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     *
+     * @return A status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
+};
+} // namespace experimental
+
+/** Basic function to perform tensor slicing */
+class CLSlice : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLSlice();
+    /** Default Destructor */
+    ~CLSlice();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSlice(const CLSlice &) = delete;
+    /** Default move constructor */
+    CLSlice(CLSlice &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSlice &operator=(const CLSlice &) = delete;
+    /** Default move assignment operator */
+    CLSlice &operator=(CLSlice &&);
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
@@ -78,6 +133,13 @@ public:
      * @return A status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_SLICE_H */
diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h
index 98a3bd49d3..394d8c4f59 100644
--- a/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -24,7 +24,9 @@
 #ifndef ARM_COMPUTE_CL_STRIDED_SLICE_H
 #define ARM_COMPUTE_CL_STRIDED_SLICE_H
 
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
@@ -32,9 +34,24 @@ namespace arm_compute
 class ICLTensor;
 
 /** Basic function to run @ref CLStridedSliceKernel */
-class CLStridedSlice : public ICLSimpleFunction
+class CLStridedSlice : public IFunction
 {
 public:
+    /** Constructor
+     *
+     * @param[in] ctx Runtime context to be used by the function
+     */
+    CLStridedSlice(CLRuntimeContext *ctx = nullptr);
+    /** Destructor */
+    ~CLStridedSlice();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLStridedSlice(const CLStridedSlice &) = delete;
+    /** Default move constructor */
+    CLStridedSlice(CLStridedSlice &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLStridedSlice &operator=(const CLStridedSlice &) = delete;
+    /** Default move assignment operator */
+    CLStridedSlice &operator=(CLStridedSlice &&);
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
@@ -88,6 +105,61 @@ public:
     static Status validate(const ITensorInfo *input, const ITensorInfo *output,
                            const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                            int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+/** Basic function to run @ref CLStridedSliceKernel */
+class CLStridedSlice : public ICLOperator
+{
+public:
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor info. Data type supported: All.
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
+                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                   int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input            Source tensor info. Data type supported: All.
+     * @param[in] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                           int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
 };
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_STRIDED_SLICE_H */
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index c1f4c4d1b5..94cbd43bb1 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/helpers/bit_ops.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/StringSupport.h"
 
@@ -66,7 +67,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
                                                         const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                                         int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
@@ -84,29 +85,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 }
 } // namespace
 
-CLStridedSliceKernel::CLStridedSliceKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
-}
-
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
-    _input  = input;
-    _output = output;
-
-    const TensorShape &input_shape = input->info()->tensor_shape();
+    const TensorShape &input_shape = input->tensor_shape();
 
     Coordinates starts_abs;
     Coordinates ends_abs;
@@ -117,12 +103,12 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
                                                         begin_mask, end_mask, shrink_axis_mask);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     // Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
-    const int  vec_size_x     = 16 / input->info()->element_size();
-    const int  output_width_x = output->info()->tensor_shape().x();
+    const int  vec_size_x     = 16 / input->element_size();
+    const int  output_width_x = output->tensor_shape().x();
     const bool is_shrink_on_x = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
     const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
 
@@ -137,7 +123,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
 
     // Create build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
     for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
@@ -150,8 +136,8 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
     build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
                                   "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
                                   "-DSRC_DEPTH=1");
-    build_opts.add_option_if_else(_output->info()->num_dimensions() > 2,
-                                  "-DDST_DEPTH=" + support::cpp11::to_string(_output->info()->tensor_shape().z()),
+    build_opts.add_option_if_else(output->num_dimensions() > 2,
+                                  "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
                                   "-DDST_DEPTH=1");
 
     // Create kernel
@@ -160,11 +146,11 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
     // Set config_id for enabling LWS tuning
     _config_id = "strided_slice";
     _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += lower_string(string_from_data_type(input->data_type()));
     for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         _config_id += "_";
-        _config_id += support::cpp11::to_string(input->info()->dimension(i));
+        _config_id += support::cpp11::to_string(input->dimension(i));
         _config_id += "_";
         _config_id += support::cpp11::to_string(starts_abs[i]);
         _config_id += "_";
@@ -186,19 +172,22 @@ Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInf
     return Status{};
 }
 
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(outputs.at(TensorType::ACL_DST));
+
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice            = window_collapsed.first_slice_window_4D();
 
     do
     {
         unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, slice);
-        add_4D_tensor_argument(idx, _output, slice);
+        add_4D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window_collapsed.slide_window_slice_4D(slice));
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index 129d97afeb..b60daeee44 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -31,12 +31,9 @@
 
 namespace arm_compute
 {
-void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+namespace experimental
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
-}
-
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -63,4 +60,50 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
 
     return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
 }
+
+MemoryRequirements CLSlice::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+struct CLSlice::Impl
+{
+    const ICLTensor                       *src{ nullptr };
+    ICLTensor                             *dst{ nullptr };
+    std::unique_ptr<experimental::CLSlice> op{ nullptr };
+};
+
+CLSlice::CLSlice()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice &CLSlice::operator=(CLSlice &&) = default;
+CLSlice::~CLSlice()                     = default;
+
+Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+    return experimental::CLSlice::validate(input, output, starts, ends);
+}
+
+void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
+}
+
+void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLSlice>();
+    _impl->op->configure(compile_context, input->info(), output->info(), starts, ends);
+}
+
+void CLSlice::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC, _impl->src } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index fc011ceaf7..d1b16700ff 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -23,12 +23,55 @@
  */
 #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
+namespace experimental
+{
+void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+    k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    _kernel = std::move(k);
+}
+
+Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+MemoryRequirements CLStridedSlice::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+struct CLStridedSlice::Impl
+{
+    const ICLTensor                              *src{ nullptr };
+    ICLTensor                                    *dst{ nullptr };
+    CLRuntimeContext                             *ctx{ nullptr };
+    std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+};
+
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+    _impl->ctx = ctx;
+}
+
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
+CLStridedSlice::~CLStridedSlice()                            = default;
+
 void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
@@ -40,15 +83,27 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC
                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
-    k->configure(compile_context, input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
-    _kernel = std::move(k);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _impl->src = input;
+    _impl->dst = output;
+
+    _impl->op = arm_compute::support::cpp14::make_unique<experimental::CLStridedSlice>();
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 }
 
 Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
                                 const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                 int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
-    return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+void CLStridedSlice::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC, _impl->src } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+
+    _impl->op->run(src, dst, {});
 }
 } // namespace arm_compute
-- 
cgit v1.2.1