From bcd2352d7fd99a2f6aab220fa0c3b3f3119a1a4c Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Thu, 21 May 2020 15:02:36 +0100
Subject: COMPMID-3391: Implement Async interfaces

Change-Id: I8168cea5056ff48a0253ebb8c88ea549a3ea69a2
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3335
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
---
 Android.bp                                         |  2 +
 arm_compute/core/CPP/ICPPKernel.h                  | 24 ++++++-
 .../core/NEON/kernels/NEReshapeLayerKernel.h       | 16 +++--
 arm_compute/core/experimental/Types.h              | 71 ++++++++++++++++++++
 arm_compute/runtime/CPP/CPPScheduler.h             | 16 ++++-
 arm_compute/runtime/IOperator.h                    | 66 ++++++++++++++++++
 arm_compute/runtime/IScheduler.h                   | 12 ++++
 arm_compute/runtime/NEON/INEOperator.h             | 67 +++++++++++++++++++
 .../NEON/functions/NEGenerateProposalsLayer.h      |  7 +-
 .../runtime/NEON/functions/NEReductionOperation.h  |  4 +-
 .../runtime/NEON/functions/NEReshapeLayer.h        | 51 ++++++++++++--
 .../runtime/NEON/functions/NESoftmaxLayer.h        |  8 +--
 arm_compute/runtime/OMP/OMPScheduler.h             | 15 ++++-
 arm_compute/runtime/OperatorTensor.h               | 75 +++++++++++++++++++++
 arm_compute/runtime/SingleThreadScheduler.h        | 10 ++-
 arm_compute/runtime/experimental/Types.h           | 42 ++++++++++++
 src/core/NEON/kernels/NEReshapeLayerKernel.cpp     | 45 ++++++-------
 src/runtime/CPP/CPPScheduler.cpp                   | 78 ++++++++++++++--------
 src/runtime/CPP/SingleThreadScheduler.cpp          |  8 +++
 src/runtime/NEON/INEOperator.cpp                   | 53 +++++++++++++++
 .../NEON/functions/NEGenerateProposalsLayer.cpp    | 20 +++---
 .../NEON/functions/NEReductionOperation.cpp        |  8 +--
 src/runtime/NEON/functions/NEReshapeLayer.cpp      | 38 ++++++++++-
 src/runtime/NEON/functions/NESoftmaxLayer.cpp      | 27 ++++----
 src/runtime/OMP/OMPScheduler.cpp                   | 35 +++++++++-
 src/runtime/OperatorTensor.cpp                     | 57 ++++++++++++++++
 tests/framework/instruments/SchedulerTimer.cpp     | 15 ++++-
 tests/validation/NEON/ReshapeOperator.cpp          | 78 ++++++++++++++++++++++
 28 files changed, 840 insertions(+), 108 deletions(-)
 create mode 100644 arm_compute/core/experimental/Types.h
 create mode 100644 arm_compute/runtime/IOperator.h
 create mode 100644 arm_compute/runtime/NEON/INEOperator.h
 create mode 100644 arm_compute/runtime/OperatorTensor.h
 create mode 100644 arm_compute/runtime/experimental/Types.h
 create mode 100644 src/runtime/NEON/INEOperator.cpp
 create mode 100644 src/runtime/OperatorTensor.cpp
 create mode 100644 tests/validation/NEON/ReshapeOperator.cpp

diff --git a/Android.bp b/Android.bp
index b484a1e909..2d12d27211 100644
--- a/Android.bp
+++ b/Android.bp
@@ -604,6 +604,7 @@ cc_library_static {
         "src/runtime/MemoryManagerOnDemand.cpp",
         "src/runtime/MultiHOG.cpp",
         "src/runtime/MultiImage.cpp",
+        "src/runtime/NEON/INEOperator.cpp",
         "src/runtime/NEON/INESimpleFunction.cpp",
         "src/runtime/NEON/INESimpleFunctionNoBorder.cpp",
         "src/runtime/NEON/functions/NEAbsoluteDifference.cpp",
@@ -742,6 +743,7 @@ cc_library_static {
         "src/runtime/OMP/OMPScheduler.cpp",
         "src/runtime/OffsetLifetimeManager.cpp",
         "src/runtime/OffsetMemoryPool.cpp",
+        "src/runtime/OperatorTensor.cpp",
         "src/runtime/PoolManager.cpp",
         "src/runtime/Pyramid.cpp",
         "src/runtime/RuntimeContext.cpp",
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index ec05af20bd..21f6ab714a 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -26,10 +26,13 @@
 
 #include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
 
 namespace arm_compute
 {
 class Window;
+class ITensor;
 
 /** Common interface for all kernels implemented in C++ */
 class ICPPKernel : public IKernel
@@ -51,8 +54,7 @@ public:
      */
     virtual void run(const Window &window, const ThreadInfo &info)
     {
-        ARM_COMPUTE_UNUSED(window);
-        ARM_COMPUTE_UNUSED(info);
+        ARM_COMPUTE_UNUSED(window, info);
         ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked");
     }
 
@@ -69,6 +71,24 @@ public:
         run(window, info);
     }
 
+    /** Execute the kernel on the passed window
+     *
+     * @warning If is_parallelisable() returns false then the passed window must be equal to window()
+     *
+     * @note The window has to be a region within the window returned by the window() method
+     *
+     * @note The width of the window has to be a multiple of num_elems_processed_per_iteration().
+     *
+     * @param[in] inputs  A vector containing the input tensors.
+     * @param[in] outputs A vector containing the output tensors.
+     * @param[in] window  Region on which to execute the kernel. (Must be a region of the window returned by window())
+     * @param[in] info    Info about executing thread and CPU.
+     */
+    virtual void run_op(const std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, const Window &window, const ThreadInfo &info)
+    {
+        ARM_COMPUTE_UNUSED(inputs, outputs, window, info);
+    }
+
     /** Name of the kernel
      *
      * @return Kernel name
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
index fccf2685a8..6f888e0914 100644
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_NERESHAPELAYERKERNEL_H
 #define ARM_COMPUTE_NERESHAPELAYERKERNEL_H
 
+#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/NEON/INESimpleKernel.h"
 
 namespace arm_compute
@@ -32,19 +33,19 @@ namespace arm_compute
 class ITensor;
 
 /** Interface for the kernel to perform tensor reshaping */
-class NEReshapeLayerKernel : public INESimpleKernel
+class NEReshapeLayerKernel : public INEKernel
 {
 public:
     const char *name() const override
     {
         return "NEReshapeLayerKernel";
     }
-    /** Set the input and output of the kernel
+    /** Set the input and output info of the kernel
      *
-     * @param[in]  input  Source tensor. Data type supported: All
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input  Source tensor info. Data type supported: All
+     * @param[out] output Destination tensor info. Data type supported: Same as @p input
      */
-    void configure(const ITensor *input, ITensor *output);
+    void configure(const ITensorInfo *input, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayerKernel
      *
@@ -56,7 +57,8 @@ public:
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(const std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, const Window &window, const ThreadInfo &info) override;
 };
+
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NERESHAPELAYERKERNEL_H */
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
new file mode 100644
index 0000000000..6043db9ff4
--- /dev/null
+++ b/arm_compute/core/experimental/Types.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_TYPES_H
+#define ARM_COMPUTE_EXPERIMENTAL_TYPES_H
+
+#include "arm_compute/core/TensorShape.h"
+
+#include <map>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Memory type */
+enum class TensorType
+{
+    ACL_SRC   = 0,
+    ACL_SRC_0 = 0,
+    ACL_SRC_1 = 1,
+    ACL_SRC_2 = 2,
+    ACL_DST   = 30,
+    ACL_DST_0 = 30,
+    ACL_DST_1 = 31,
+    ACL_INT   = 50,
+    ACL_INT_0 = 50,
+    ACL_INT_1 = 51,
+    ACL_INT_2 = 52
+};
+using InputOperatorTensors  = std::pair<TensorType /* tensor type */, const ITensor * /* tensor object */>;
+using OutputOperatorTensors = std::pair<TensorType /* tensor type */, ITensor * /* tensor object */>;
+using OperatorTensors       = OutputOperatorTensors;
+
+namespace experimental
+{
+struct MemoryInfo
+{
+    MemoryInfo(TensorType type, size_t size, size_t alignment)
+        : type(type), size(size), alignment(alignment)
+    {
+    }
+    TensorType type;
+    size_t     size;
+    size_t     alignment;
+};
+
+using MemoryRequirements = std::vector<MemoryInfo>;
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_EXPERIMENTAL_TYPES_H */
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index c8de41bf20..78ad43c2b4 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CPPSCHEDULER_H
 #define ARM_COMPUTE_CPPSCHEDULER_H
 
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/IScheduler.h"
 
 #include <memory>
@@ -65,6 +66,18 @@ public:
      * @param[in] hints  Hints for the scheduler.
      */
     void schedule(ICPPKernel *kernel, const Hints &hints) override;
+    /** Multithread the execution of the passed kernel if possible.
+     *
+     * The kernel will run on a single thread if any of these conditions is true:
+     * - ICPPKernel::is_parallelisable() returns false
+     * - The scheduler has been initialized with only one thread.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] inputs  Vector that contains the input tensors.
+     * @param[in] outputs Vector that contains the output tensors.
+     */
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs) override;
 
 protected:
     /** Will run the workloads in parallel using num_threads
@@ -74,6 +87,7 @@ protected:
     void run_workloads(std::vector<Workload> &workloads) override;
 
 private:
+    void schedule_common(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs);
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
diff --git a/arm_compute/runtime/IOperator.h b/arm_compute/runtime/IOperator.h
new file mode 100644
index 0000000000..110c935702
--- /dev/null
+++ b/arm_compute/runtime/IOperator.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_IOPERATOR_H
+#define ARM_COMPUTE_IOPERATOR_H
+
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Base class for all async functions */
+class IOperator
+{
+public:
+    /** Destructor */
+    virtual ~IOperator() = default;
+    /** Run the kernels contained in the function
+     *
+     *
+     * @param[in] inputs    Vector that contains the input tensors.
+     * @param[in] outputs   Vector that contains the output tensors.
+     * @param[in] workspace Vector that contains the workspace tensors.
+     *
+     */
+    virtual void run(std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, std::vector<OperatorTensors *> &workspace) = 0;
+    /** Prepare the function for executing
+     *
+     * Any one off pre-processing step required by the function is handled here
+     *
+     * @param[in] constants Vector that contains the constants tensors.
+     *
+     * @note Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
+     */
+    virtual void prepare(std::vector<OperatorTensors *> constants) = 0;
+
+    /** Return the memory requirements required by the workspace
+     */
+    virtual MemoryRequirements workspace() const = 0;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IOPERATOR_H */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index a5e20ee627..02d0cef086 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -25,6 +25,8 @@
 #define ARM_COMPUTE_ISCHEDULER_H
 
 #include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
 
 #include <functional>
 #include <limits>
@@ -32,6 +34,7 @@
 namespace arm_compute
 {
 class ICPPKernel;
+class ITensor;
 
 /** Scheduler interface to run kernels */
 class IScheduler
@@ -147,6 +150,15 @@ public:
      */
     virtual void schedule(ICPPKernel *kernel, const Hints &hints) = 0;
 
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] inputs  Vector containing the input tensors.
+     * @param[in] outputs Vector containing the output tensors.
+     */
+    virtual void schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs) = 0;
+
     /** Execute all the passed workloads
      *
      * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h
new file mode 100644
index 0000000000..4467e6d5ab
--- /dev/null
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_INEOPERATOR_H
+#define ARM_COMPUTE_INEOPERATOR_H
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Basic interface for functions which have a single async NEON kernel */
+class INEOperator : public IOperator
+{
+public:
+    /** Constructor
+     *
+     * @param[in] ctx Runtime context to be used by the function
+     */
+    INEOperator(IRuntimeContext *ctx = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEOperator(const INEOperator &) = delete;
+    /** Default move constructor */
+    INEOperator(INEOperator &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEOperator &operator=(const INEOperator &) = delete;
+    /** Default move assignment operator */
+    INEOperator &operator=(INEOperator &&) = default;
+
+    // Inherited methods overridden:
+    void run(std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, std::vector<OperatorTensors *> &workspace) override final;
+    void prepare(std::vector<OperatorTensors *> constants) override final;
+
+protected:
+    std::unique_ptr<INEKernel> _kernel;
+    IRuntimeContext           *_ctx;
+    MemoryRequirements         _workspace;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_INEOPERATOR_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 7c470fbaf0..7260434606 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -23,18 +23,19 @@
  */
 #ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H
 #define ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H
+
 #include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
 #include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -112,9 +113,9 @@ private:
 
     // Neon kernels
     NEPermuteKernel              _permute_deltas_kernel;
-    NEReshapeLayerKernel         _flatten_deltas_kernel;
+    NEReshapeLayer               _flatten_deltas;
     NEPermuteKernel              _permute_scores_kernel;
-    NEReshapeLayerKernel         _flatten_scores_kernel;
+    NEReshapeLayer               _flatten_scores;
     NEComputeAllAnchorsKernel    _compute_anchors_kernel;
     NEBoundingBoxTransformKernel _bounding_box_kernel;
     NEPadLayerKernel             _pad_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index abda4159ba..78e8b04dbb 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -28,8 +28,8 @@
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -76,7 +76,7 @@ private:
     MemoryGroup                _memory_group;
     NEReductionOperationKernel _reduction_kernel;
     NEFillBorderKernel         _fill_border_kernel;
-    NEReshapeLayerKernel       _reshape_kernel;
+    NEReshapeLayer             _reshape;
     Tensor                     _output_internal;
     size_t                     _window_split;
     int                        _reduction_axis;
diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
index d6643842d9..5a296a776d 100644
--- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,11 @@
 #ifndef ARM_COMPUTE_NERESHAPELAYER_H
 #define ARM_COMPUTE_NERESHAPELAYER_H
 
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/runtime/Types.h"
 
 namespace arm_compute
 {
@@ -33,24 +36,62 @@ namespace arm_compute
 class ITensor;
 
 /** Basic function to run @ref NEReshapeLayerKernel */
-class NEReshapeLayer : public INESimpleFunctionNoBorder
+class NEReshapeLayer : public IFunction
 {
 public:
     /** Initialise the kernel's inputs and outputs
      *
-     * @param[in]  input  First tensor input. Data type supported: All
+     * @param[in]  input  Input tensor. Data type supported: All
      * @param[out] output Output tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayer
      *
-     * @param[in] input  First tensor info. Data type supported: All
+     * @param[in] input  Input tensor info. Data type supported: All
      * @param[in] output Output tensor info. Data type supported: Same as @p input
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    const ITensor *_input
+    {
+        nullptr
+    };
+    ITensor                              *_output{ nullptr };
+    std::unique_ptr<NEReshapeLayerKernel> _kernel{ nullptr };
+};
+
+namespace experimental
+{
+/** Basic function to run @ref NEReshapeLayerKernel */
+class NEReshapeLayer : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  input  Input tensor info. Data type supported: All
+     * @param[out] output Output info. Data type supported: Same as @p input
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayer
+     *
+     * @param[in] input  Input tensor info. Data type supported: All
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
 };
+} // namespace experimental
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NERESHAPELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index c5c83d8b5a..51d981de44 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -25,11 +25,11 @@
 #define ARM_COMPUTE_NESOFTMAXLAYER_H
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -119,9 +119,9 @@ private:
     MemoryGroup                     _memory_group;
     NELogits1DMaxKernel             _max_kernel;
     NELogits1DSoftmaxKernel<IS_LOG> _softmax_kernel;
-    std::unique_ptr<INEKernel>      _flat_or_reshape_kernel_ptr;
+    std::unique_ptr<IFunction>      _flat_or_reshape_ptr;
     NEFillBorderKernel              _fill_border_kernel;
-    NEReshapeLayerKernel            _reshape_kernel;
+    NEReshapeLayer                  _reshape;
     Tensor                          _max;
     Tensor                          _tmp;
     Tensor                          _input_flattened;
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index ed00833a9c..8ed1705a97 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,19 @@ public:
      */
     void schedule(ICPPKernel *kernel, const Hints &hints) override;
 
+    /** Multithread the execution of the passed kernel if possible.
+     *
+     * The kernel will run on a single thread if any of these conditions is true:
+     * - ICPPKernel::is_parallelisable() returns false
+     * - The scheduler has been initialized with only one thread.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] inputs  Vector containing the input tensors.
+     * @param[in] outputs Vector containing the output tensors.
+     */
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs) override;
+
 protected:
     /** Execute all the passed workloads
      *
diff --git a/arm_compute/runtime/OperatorTensor.h b/arm_compute/runtime/OperatorTensor.h
new file mode 100644
index 0000000000..3901f93291
--- /dev/null
+++ b/arm_compute/runtime/OperatorTensor.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_OPERATORTENSOR_H
+#define ARM_COMPUTE_OPERATORTENSOR_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Types.h"
+#include "arm_compute/runtime/experimental/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class TensorInfo;
+class IRuntimeContext;
+class IMemory;
+namespace experimental
+{
+/** Basic implementation of the tensor interface */
+class OperatorTensor : public ITensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] info   Pointer to the tensor info.
+     * @param[in] memory Pointer to the memory info.
+     *
+     */
+    OperatorTensor(ITensorInfo *info, IMemory *memory);
+    /** Destructor: free the tensor's memory */
+    ~OperatorTensor() = default;
+    /** Allow instances of this class to be move constructed */
+    OperatorTensor(OperatorTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    OperatorTensor &operator=(OperatorTensor &&) = default;
+    /** Prevent instances of this class to be copy assigned */
+    OperatorTensor &operator=(const OperatorTensor &) = delete;
+    /** Prevent instances of this class to be copy constructed */
+    OperatorTensor(const OperatorTensor &) = delete;
+
+    // Inherited methods overridden:
+    arm_compute::ITensorInfo *info() const override;
+    arm_compute::ITensorInfo *info() override;
+    uint8_t                  *buffer() const override;
+
+private:
+    arm_compute::ITensorInfo *_info;
+    arm_compute::IMemory     *_memory;
+    MemoryType                _mem_type;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_OPERATORTENSOR_H */
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
index 3f279ebb19..8094758249 100644
--- a/arm_compute/runtime/SingleThreadScheduler.h
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,14 @@ public:
      * @param[in] hints  Hints for the scheduler.
      */
     void schedule(ICPPKernel *kernel, const Hints &hints) override;
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] inputs  Vector containing the input tensors.
+     * @param[in] outputs Vector containing the output tensors.
+     */
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs) override;
 
 protected:
     /** Will run the workloads sequentially and in order.
diff --git a/arm_compute/runtime/experimental/Types.h b/arm_compute/runtime/experimental/Types.h
new file mode 100644
index 0000000000..bced0072b8
--- /dev/null
+++ b/arm_compute/runtime/experimental/Types.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H
+#define ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Memory type */
+enum class MemoryType
+{
+    CPU,
+    CL,
+    GLES
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H */
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 53fcfd724d..600f8f9bf1 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,13 +31,14 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
 #include <cstdint>
 
 /** [NEReshapeLayerKernel Kernel] **/
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
@@ -71,56 +72,54 @@ inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *
 }
 } // namespace
 
-void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
+void NEReshapeLayerKernel::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info());
+    Window win = calculate_max_window(*input);
 
     // Set the output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     INEKernel::configure(win);
 }
 
-Status NEReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
-    return Status{};
-}
-
-void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
+void NEReshapeLayerKernel::run_op(const std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_input->info()->data_type())
+    switch(inputs[0]->second->info()->data_type())
     {
         case DataType::U8:
         case DataType::S8:
         case DataType::QASYMM8:
         case DataType::QASYMM8_SIGNED:
-            reshape_tensor<uint8_t>(window, _input, _output);
+            reshape_tensor<uint8_t>(window, inputs[0]->second, outputs[0]->second);
             break;
         case DataType::U16:
         case DataType::S16:
         case DataType::F16:
-            reshape_tensor<uint16_t>(window, _input, _output);
+            reshape_tensor<uint16_t>(window, inputs[0]->second, outputs[0]->second);
             break;
         case DataType::U32:
         case DataType::S32:
         case DataType::F32:
-            reshape_tensor<uint32_t>(window, _input, _output);
+            reshape_tensor<uint32_t>(window, inputs[0]->second, outputs[0]->second);
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type!");
     }
 }
+
+Status NEReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+    return Status{};
+}
+} // namespace arm_compute
 /** [NEReshapeLayerKernel Kernel] **/
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 0a03497cb9..db551590ea 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -95,10 +95,10 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std:
 
     // nt = sqrt(max_threads * (m / n) )
     const unsigned adjusted = std::round(
-                    std::sqrt(max_threads * ratio));
+                                  std::sqrt(max_threads * ratio));
 
     //find the nearest factor of max_threads
-    for(unsigned i = 0; i!= adjusted; ++i)
+    for(unsigned i = 0; i != adjusted; ++i)
     {
         //try down
         const unsigned adj_down = adjusted - i;
@@ -118,11 +118,11 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std:
     //we didn't find anything so lets bail out with maxes biased to the largest dimension
     if(m > n)
     {
-         return{ std::min<unsigned>(m, max_threads), 1 };
+        return { std::min<unsigned>(m, max_threads), 1 };
     }
     else
     {
-        return{ 1, std::min<unsigned>(n, max_threads) };
+        return { 1, std::min<unsigned>(n, max_threads) };
     }
 }
 
@@ -144,7 +144,6 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede
     }
     while(feeder.get_next(workload_index));
 }
-
 } //namespace
 
 struct CPPScheduler::Impl final
@@ -364,11 +363,11 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
-void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
+void CPPScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
-    const Window      &max_window     = kernel->window();
+    const Window &max_window = kernel->window();
 
     if(hints.split_dimension() == IScheduler::split_dimensions_all)
     {
@@ -379,34 +378,32 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
         const std::size_t m = max_window.num_iterations(Window::DimX);
         const std::size_t n = max_window.num_iterations(Window::DimY);
 
-       //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
+        //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
         unsigned m_threads, n_threads;
         std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n);
 
         std::vector<IScheduler::Workload> workloads;
-        for(unsigned int ni  = 0; ni != n_threads; ++ni)
+        for(unsigned int ni = 0; ni != n_threads; ++ni)
         {
-            for(unsigned int mi  = 0; mi != m_threads; ++mi)
+            for(unsigned int mi = 0; mi != m_threads; ++mi)
             {
                 workloads.push_back(
-                    [ ni, mi, m_threads, n_threads, &max_window, &kernel ]
-                    (const ThreadInfo & info)
-                    {
-                        //narrow the window to our mi-ni workload
-                        Window win = max_window.split_window(Window::DimX, mi, m_threads)
-                                               .split_window(Window::DimY, ni, n_threads);
+                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
+                {
+                    //narrow the window to our mi-ni workload
+                    Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                 .split_window(Window::DimY, ni, n_threads);
 
-                        win.validate();
+                    win.validate();
 
-                        Window thread_locator;
-                        thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
-                        thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+                    Window thread_locator;
+                    thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                    thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
 
-                        thread_locator.validate();
+                    thread_locator.validate();
 
-                        kernel->run_nd(win, info, thread_locator);
-                    }
-                );
+                    kernel->run_nd(win, info, thread_locator);
+                });
             }
         }
         run_workloads(workloads);
@@ -425,7 +422,14 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
         {
             ThreadInfo info;
             info.cpu_info = &_cpu_info;
-            kernel->run(max_window, info);
+            if(inputs.empty())
+            {
+                kernel->run(max_window, info);
+            }
+            else
+            {
+                kernel->run_op(inputs, outputs, max_window, info);
+            }
         }
         else
         {
@@ -449,15 +453,35 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
             for(unsigned int t = 0; t < num_windows; t++)
             {
                 //Capture 't' by copy, all the other variables by reference:
-                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &inputs, &outputs](const ThreadInfo & info)
                 {
                     Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                     win.validate();
-                    kernel->run(win, info);
+
+                    if(inputs.empty())
+                    {
+                        kernel->run(win, info);
+                    }
+                    else
+                    {
+                        kernel->run_op(inputs, outputs, win, info);
+                    }
                 };
             }
             run_workloads(workloads);
         }
     }
 }
+
+void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs)
+{
+    schedule_common(kernel, hints, inputs, outputs);
+}
+
+void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
+{
+    std::vector<InputOperatorTensors *>  inputs;
+    std::vector<OutputOperatorTensors *> outputs;
+    schedule_common(kernel, hints, inputs, outputs);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 660a79652c..777f84bec8 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -49,6 +49,14 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
     kernel->run(kernel->window(), info);
 }
 
+void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs)
+{
+    ARM_COMPUTE_UNUSED(hints);
+    ThreadInfo info;
+    info.cpu_info = &_cpu_info;
+    kernel->run_op(inputs, outputs, kernel->window(), info);
+}
+
 void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads)
 {
     ThreadInfo info;
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
new file mode 100644
index 0000000000..c24d5c47f1
--- /dev/null
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+INEOperator::INEOperator(IRuntimeContext *ctx)
+    : _kernel(), _ctx(ctx), _workspace()
+{
+}
+
+void INEOperator::run(std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, std::vector<OperatorTensors *> &workspace)
+{
+    ARM_COMPUTE_UNUSED(workspace);
+
+    if(inputs.empty() || outputs.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
+    }
+
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, inputs, outputs);
+}
+
+void INEOperator::prepare(std::vector<OperatorTensors *> constants)
+{
+    ARM_COMPUTE_UNUSED(constants);
+}
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 82880bac85..dabbebacb4 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -31,9 +31,9 @@ namespace arm_compute
 NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager),
       _permute_deltas_kernel(),
-      _flatten_deltas_kernel(),
+      _flatten_deltas(),
       _permute_scores_kernel(),
-      _flatten_scores_kernel(),
+      _flatten_scores(),
       _compute_anchors_kernel(),
       _bounding_box_kernel(),
       _pad_kernel(),
@@ -95,12 +95,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
     {
         _memory_group.manage(&_deltas_permuted);
         _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+        _flatten_deltas.configure(deltas, &_deltas_flattened);
     }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
@@ -112,12 +112,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
     {
         _memory_group.manage(&_scores_permuted);
         _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _flatten_scores.configure(&_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+        _flatten_scores.configure(scores, &_scores_flattened);
     }
 
     Tensor *anchors_to_use = &_all_anchors;
@@ -244,12 +244,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
     }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
     TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
     TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -327,8 +327,8 @@ void NEGenerateProposalsLayer::run()
         NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
     }
 
-    NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+    _flatten_deltas.run();
+    _flatten_scores.run();
 
     if(_is_qasymm8)
     {
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 80ebe6731a..a895147cc9 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -54,7 +54,7 @@ size_t reduction_window_split_dimension(unsigned int axis)
 } // namespace
 
 NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
 {
 }
 
@@ -91,7 +91,7 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
 
     if(is_reshape_required)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
     }
 
     return Status{};
@@ -171,7 +171,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
 
     if(_is_reshape_required)
     {
-        _reshape_kernel.configure(output_internal, output);
+        _reshape.configure(output_internal, output);
         _output_internal.allocator()->allocate();
     }
 }
@@ -185,7 +185,7 @@ void NEReductionOperation::run()
     NEScheduler::get().schedule(&_reduction_kernel, _window_split);
     if(_is_reshape_required)
     {
-        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+        _reshape.run();
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index 0a9f42d510..680abef026 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -25,19 +25,44 @@
 
 #include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Types.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+namespace experimental
+{
+void NEReshapeLayer::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
+Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::NEReshapeLayer::validate(input, output);
+}
+
+MemoryRequirements NEReshapeLayer::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+{
+    _input  = input;
+    _output = output;
+
+    auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
+    k->configure(input->info(), output->info());
+    _kernel = std::move(k);
+}
+
 Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -45,4 +70,15 @@ Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *out
 
     return Status{};
 }
+
+void NEReshapeLayer::run()
+{
+    InputOperatorTensors  src_0 = std::make_pair(TensorType::ACL_SRC, _input);
+    OutputOperatorTensors dst_0 = std::make_pair(TensorType::ACL_DST, _output);
+
+    std::vector<InputOperatorTensors *>  inputs  = { &src_0 };
+    std::vector<OutputOperatorTensors *> outputs = { &dst_0 };
+
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, inputs, outputs);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 5509edec87..5cd6a550af 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,8 +32,8 @@ namespace arm_compute
 {
 template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
-      _output_flattened(), _needs_flattening(false)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_ptr(nullptr), _fill_border_kernel(), _reshape(), _max(), _tmp(), _input_flattened(), _output_flattened(),
+      _needs_flattening(false)
 {
 }
 
@@ -46,23 +46,20 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor
     // Initialize the flat input
     _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
 
-    // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel
-    // If the number of reduced axes is 3 (max dimension), which means collapsing all axes except the batch axis, we use NEFlattenKernel.
-    // In all other cases we have to use NEReshapeKernel
     // Note that the "other cases" include both:
     //   1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
     //   2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by NEReshapeKernel instead of NEFlattenKernel.
     if(first_n_reduce_axes == 3)
     {
-        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
+        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayer>();
         flatten_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+        _flat_or_reshape_ptr = std::move(flatten_kernel_ptr);
     }
     else
     {
-        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
+        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayer>();
         reshape_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+        _flat_or_reshape_ptr = std::move(reshape_kernel_ptr);
     }
 
     // We need to init the output tensor here. Indeed, the reshape kernel expects
@@ -127,7 +124,7 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
         _input_flattened.allocator()->allocate();
 
         // Reshape the flat output into the requested (4D) output
-        _reshape_kernel.configure(&_output_flattened, output);
+        _reshape.configure(&_output_flattened, output);
 
         // Allocate the intermediate flat tensors
         _output_flattened.allocator()->allocate();
@@ -174,11 +171,11 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
 
         if(first_n_reduce_axes == 3)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &tensor_info_flat));
         }
         else
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(input, &tensor_info_flat));
         }
     }
 
@@ -195,7 +192,7 @@ void           NESoftmaxLayerGeneric<IS_LOG>::run()
 
     if(_needs_flattening)
     {
-        NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+        _flat_or_reshape_ptr->run();
     }
 
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
@@ -204,11 +201,11 @@ void           NESoftmaxLayerGeneric<IS_LOG>::run()
 
     if(_needs_flattening)
     {
-        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+        _reshape.run();
     }
 }
 
 template class NESoftmaxLayerGeneric<false>;
 template class NESoftmaxLayerGeneric<true>;
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index f67f06fc94..a1851f03c3 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -83,6 +83,39 @@ void OMPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
     }
 }
 
+void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+    ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC,
+                             "Dynamic scheduling is not supported in OMPScheduler");
+
+    const Window      &max_window     = kernel->window();
+    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
+
+    if(!kernel->is_parallelisable() || num_threads == 1)
+    {
+        ThreadInfo info;
+        info.cpu_info = &_cpu_info;
+        kernel->run_op(inputs, outputs, max_window, info);
+    }
+    else
+    {
+        const unsigned int                num_windows = num_threads;
+        std::vector<IScheduler::Workload> workloads(num_windows);
+        for(unsigned int t = 0; t < num_windows; t++)
+        {
+            //Capture 't' by copy, all the other variables by reference:
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &inputs, &outputs](const ThreadInfo & info)
+            {
+                Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                win.validate();
+                kernel->run_op(inputs, outputs, win, info);
+            };
+        }
+        run_workloads(workloads);
+    }
+}
 #ifndef DOXYGEN_SKIP_THIS
 void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
 {
diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp
new file mode 100644
index 0000000000..5d4e126177
--- /dev/null
+++ b/src/runtime/OperatorTensor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OperatorTensor.h"
+#include "arm_compute/runtime/MemoryRegion.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+OperatorTensor::OperatorTensor(ITensorInfo *info, IMemory *memory)
+    : _info(info), _memory(memory), _mem_type(MemoryType::CPU)
+{
+}
+
+ITensorInfo *OperatorTensor::info() const
+{
+    return _info;
+}
+
+ITensorInfo *OperatorTensor::info()
+{
+    return _info;
+}
+
+uint8_t *OperatorTensor::buffer() const
+{
+    switch(_mem_type)
+    {
+        case MemoryType::CPU:
+            return (uint8_t *)dynamic_cast<MemoryRegion *>(_memory->region())->buffer();
+        default:
+            ARM_COMPUTE_ERROR("Memory type not supported.");
+    }
+}
+} // namespace experimental
+} // namespace arm_compute
diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp
index 9e8bba28e8..58e1b56904 100644
--- a/tests/framework/instruments/SchedulerTimer.cpp
+++ b/tests/framework/instruments/SchedulerTimer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,6 +86,19 @@ public:
         _kernels.push_back(std::move(info));
     }
 
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs) override
+    {
+        _timer.start();
+        _real_scheduler.schedule_op(kernel, hints, inputs, outputs);
+        _timer.stop();
+
+        typename SchedulerClock<output_timestamps>::kernel_info info;
+        info.name         = kernel->name();
+        info.prefix       = _prefix;
+        info.measurements = _timer.measurements();
+        _kernels.push_back(std::move(info));
+    }
+
     void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag) override
     {
         _timer.start();
diff --git a/tests/validation/NEON/ReshapeOperator.cpp b/tests/validation/NEON/ReshapeOperator.cpp
new file mode 100644
index 0000000000..82e9768a2c
--- /dev/null
+++ b/tests/validation/NEON/ReshapeOperator.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+#include "arm_compute/runtime/OperatorTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/Utils.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(ReshapeOperator)
+
+TEST_CASE(Run, framework::DatasetMode::ALL)
+{
+    // Create tensors and info
+    TensorInfo src_info(TensorShape(27U, 11U, 3U), 1, DataType::F32);
+    TensorInfo dst_info(TensorShape(27U, 11U, 3U), 1, DataType::F32);
+    Tensor     src = create_tensor<Tensor>(TensorShape(27U, 11U, 3U), DataType::F32, 1);
+    Tensor     dst = create_tensor<Tensor>(TensorShape(27U, 11U, 3U), DataType::F32, 1);
+
+    // Create and configure function
+    experimental::NEReshapeLayer reshape_operator;
+    reshape_operator.configure(&src_info, &dst_info);
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    InputOperatorTensors  src_0 = std::make_pair(TensorType::ACL_SRC, &src);
+    OutputOperatorTensors dst_0 = std::make_pair(TensorType::ACL_DST, &dst);
+
+    std::vector<InputOperatorTensors *>  src_vec  = { &src_0 };
+    std::vector<OutputOperatorTensors *> dst_vec  = { &dst_0 };
+    std::vector<OperatorTensors *>       work_vec = {};
+
+    // Compute functions
+    reshape_operator.run(src_vec, dst_vec, work_vec);
+}
+
+TEST_SUITE_END() // ReshapeOperator
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
-- 
cgit v1.2.1