9 files changed, 256 insertions, 55 deletions
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index 84789e70d2..ed27320650 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,25 +22,35 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"
-#include "arm_compute/runtime/CL/CLMemoryRegion.h"
 
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
-using namespace arm_compute;
-
-CLBufferAllocator::CLBufferAllocator(cl::Context context)
-    : _context(std::move(context))
+namespace arm_compute
+{
+CLBufferAllocator::CLBufferAllocator(CLCoreRuntimeContext *ctx)
+    : _ctx(ctx) // kept as a raw pointer; allocate() treats nullptr as "use the CLScheduler singleton"
 {
 }
 
 void *CLBufferAllocator::allocate(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    cl_mem buf = clCreateBuffer(_context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
+    cl_mem buf = nullptr;
+    if(_ctx != nullptr)
+    {
+        buf = clCreateBuffer(_ctx->context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr); // injected runtime context
+    }
+    else
+    {
+        buf = clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr); // legacy singleton
+    }
     return static_cast<void *>(buf);
 }
 
@@ -53,5 +63,6 @@ void CLBufferAllocator::free(void *ptr)
 std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); // _ctx may be nullptr: the region constructor then falls back to the CLScheduler singleton
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index edfc8ed2aa..c4c7ee2107 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 
 namespace
 {
@@ -103,4 +104,19 @@ create_opencl_context_and_device()
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
     return std::make_tuple(cl_context, device, err);
 }
+
+// Enqueue @p kernel on the scheduler of @p ctx, or on the CLScheduler singleton when @p ctx is null.
+void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
+    if(ctx == nullptr)
+    {
+        // Legacy API: no runtime context was supplied
+        CLScheduler::get().enqueue(*kernel, flush);
+        return;
+    }
+    ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
+    ctx->gpu_scheduler()->enqueue(*kernel, flush);
+}
+
 } // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 2976903c93..52906a893f 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -23,13 +23,18 @@
  */
 #include "arm_compute/runtime/CL/CLMemoryRegion.h"
 
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-ICLMemoryRegion::ICLMemoryRegion(cl::Context ctx, size_t size)
-    : IMemoryRegion(size), _ctx(std::move(ctx)), _mapping(nullptr), _mem()
+ICLMemoryRegion::ICLMemoryRegion(CLCoreRuntimeContext *ctx, size_t size)
+    : IMemoryRegion(size),
+      _queue((ctx != nullptr) ? ctx->queue() : CLScheduler::get().queue()),   // null ctx: queue of the CLScheduler singleton
+      _ctx((ctx != nullptr) ? ctx->context() : CLScheduler::get().context()), // null ctx: context of the CLScheduler singleton
+      _mapping(nullptr),
+      _mem()
 {
 }
 
@@ -54,17 +59,17 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
     return nullptr;
 }
 
-CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
-    : ICLMemoryRegion(std::move(ctx), size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size)
+    : ICLMemoryRegion(ctx, size) // the base constructor resolves _ctx (runtime context, or CLScheduler singleton when ctx is nullptr)
 {
     if(_size != 0)
     {
         _mem = cl::Buffer(_ctx, flags, _size);
     }
 }
 
-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
-    : ICLMemoryRegion(buffer.getInfo<CL_MEM_CONTEXT>(), buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer, CLCoreRuntimeContext *ctx)
+    : ICLMemoryRegion(ctx, buffer.getInfo<CL_MEM_SIZE>()) // NOTE(review): context/queue now come from ctx (or the singleton), no longer from the buffer's CL_MEM_CONTEXT; confirm callers pass a matching ctx
 {
     _mem = buffer;
 }
@@ -88,15 +93,15 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }
 
-ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLMemoryRegion(std::move(ctx), size), _ptr(nullptr)
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLMemoryRegion(ctx, size), _ptr(nullptr) // base constructor resolves _ctx; the destructor frees the SVM block with that same _ctx
 {
     if(size != 0)
     {
         _ptr = clSVMAlloc(_ctx.get(), flags, size, alignment);
         if(_ptr != nullptr)
         {
             _mem = cl::Buffer(_ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
         }
     }
 }
@@ -107,7 +112,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
     {
         try
         {
-            clFinish(CLScheduler::get().queue().get());
+            clFinish(_queue.get());
             _mem = cl::Buffer();
             clSVMFree(_ctx.get(), _ptr);
         }
@@ -122,8 +127,8 @@ void *ICLSVMMemoryRegion::ptr()
     return _ptr;
 }
 
-CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(ctx, flags, size, alignment) // pure forwarding: this constructor has no body of its own
 {
 }
 
@@ -142,8 +147,8 @@ void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }
 
-CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(ctx, flags, size, alignment) // pure forwarding: this constructor has no body of its own
 {
 }
 
@@ -162,4 +167,4 @@ void CLFineSVMMemoryRegion::unmap(cl::CommandQueue &q)
     ARM_COMPUTE_UNUSED(q);
     _mapping = nullptr;
 }
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
new file mode 100644
index 0000000000..49e4c10c84
--- /dev/null
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLRuntimeContext::CLRuntimeContext() // creates an OpenCL context and queue, then initialises the owned scheduler, the kernel library and the core context
+    : _gpu_owned_scheduler(support::cpp14::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _core_context()
+{
+    _symbols.load_default();
+    auto ctx_dev_err = create_opencl_context_and_device(); // tuple of (context, device, error code)
+    ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
+    auto             ctx   = std::get<0>(ctx_dev_err);
+    auto             dev   = std::get<1>(ctx_dev_err);
+    cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
+    _gpu_owned_scheduler->init(ctx, queue, dev, &_tuner);
+    const std::string cl_kernels_folder("./cl_kernels/"); // trailing slash, matching the folder CLScheduler::default_init_with_context passes to CLKernelLibrary::init
+    _kernel_lib.init(cl_kernels_folder, ctx, dev);
+    _core_context = CLCoreRuntimeContext(&_kernel_lib, _gpu_owned_scheduler->context(), _gpu_owned_scheduler->queue()); // NOTE(review): CLScheduler::context() returns CLKernelLibrary::get().context(); confirm it equals ctx here
+}
+
+CLKernelLibrary &CLRuntimeContext::kernel_library()
+{
+    return _kernel_lib; // the member library initialised in the constructor, not CLKernelLibrary::get()
+}
+
+CLCoreRuntimeContext *CLRuntimeContext::core_runtime_context()
+{
+    return &_core_context; // address of a member: only valid while this CLRuntimeContext is alive
+}
+
+void CLRuntimeContext::set_gpu_scheduler(CLScheduler *scheduler)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler);
+    _gpu_scheduler = scheduler; // only the raw pointer is swapped; _gpu_owned_scheduler is not modified here
+}
+
+CLScheduler *CLRuntimeContext::gpu_scheduler()
+{
+    return _gpu_scheduler; // the owned scheduler unless set_gpu_scheduler() replaced it
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 701ffe0ab1..e78eaa482f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -23,13 +23,71 @@
  */
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "arm_compute/runtime/CL/CLHelpers.h"
-
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
 #include "arm_compute/runtime/CL/tuners/Tuners.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+cl::Context &CLScheduler::context()
+{
+    ARM_COMPUTE_ERROR_ON(!_is_initialised);
+    _context = CLKernelLibrary::get().context(); // re-read from the CLKernelLibrary singleton on every call
+    return _context;
+}
+
+cl::CommandQueue &CLScheduler::queue()
+{
+    ARM_COMPUTE_ERROR_ON(!_is_initialised); // flags use of the queue before the scheduler was initialised
+    return _queue;
+}
+
+GPUTarget CLScheduler::target() const
+{
+    return _target; // unlike context()/queue(), no initialisation check is made here
+}
+
+void CLScheduler::set_queue(cl::CommandQueue queue)
+{
+    _queue = std::move(queue); // the by-value parameter is moved into the member
+}
+
+void CLScheduler::set_target(GPUTarget target)
+{
+    _target = target; // plain overwrite, no validation
+}
+
+void CLScheduler::set_tuner(ICLTuner *tuner)
+{
+    _cl_tuner = tuner; // nullptr is accepted; tune_kernel_static() checks for it before use
+}
+
+void CLScheduler::sync()
+{
+    _queue.finish(); // cl::CommandQueue::finish on the scheduler's queue; no _is_initialised check here
+}
+
+cl::Event CLScheduler::enqueue_sync_event()
+{
+    cl::Event event;
+    _queue.enqueueMarker(&event); // marker enqueued on _queue; its event is what the caller receives
+    return event;
+}
+
+void CLScheduler::tune_kernel_static(ICLKernel &kernel)
+{
+    if(_cl_tuner != nullptr) // silently a no-op when no tuner is set
+    {
+        _cl_tuner->tune_kernel_static(kernel);
+    }
+}
+
+bool CLScheduler::is_initialised() const
+{
+    return _is_initialised; // the same flag that context() and queue() check
+}
 
 std::once_flag CLScheduler::_initialize_symbols;
 
@@ -49,8 +107,9 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx
 {
     if(!_is_initialised)
     {
-        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
-        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+        const std::string cl_kernels_folder("./cl_kernels/");
+        cl::CommandQueue  queue = cl::CommandQueue(ctx, device);
+        CLKernelLibrary::get().init(cl_kernels_folder, ctx, device);
         init(ctx, queue, device, cl_tuner);
         _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
         _cl_tuner                = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
@@ -113,3 +172,4 @@ void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
         _queue.flush();
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index 9bbf926b58..a6d0cf77ca 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -23,15 +23,21 @@
  */
 #include "arm_compute/runtime/CL/CLTensor.h"
 
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-CLTensor::CLTensor()
-    : _allocator(this)
+CLTensor::CLTensor(IRuntimeContext *ctx)
+    : _allocator(this, static_cast<CLRuntimeContext *>(ctx)), _ctx(static_cast<CLRuntimeContext *>(ctx)) // NOTE(review): unchecked downcast; assumes ctx is null or really a CLRuntimeContext
 {
 }
 
+CLRuntimeContext *CLTensor::context()
+{
+    return _ctx; // may be nullptr (map()/unmap() then use the CLScheduler singleton)
+}
+
 TensorInfo *CLTensor::info() const
 {
     return &_allocator.info();
@@ -59,12 +65,12 @@ CLTensorAllocator *CLTensor::allocator()
 
 void CLTensor::map(bool blocking)
 {
-    ICLTensor::map(CLScheduler::get().queue(), blocking);
+    ICLTensor::map(_ctx == nullptr ? CLScheduler::get().queue() : _ctx->gpu_scheduler()->queue(), blocking); // no context: queue of the legacy CLScheduler singleton
 }
 
 void CLTensor::unmap()
 {
-    ICLTensor::unmap(CLScheduler::get().queue());
+    ICLTensor::unmap(_ctx == nullptr ? CLScheduler::get().queue() : _ctx->gpu_scheduler()->queue()); // same queue selection as map()
 }
 
 uint8_t *CLTensor::do_map(cl::CommandQueue &q, bool blocking)
@@ -81,4 +87,4 @@ void CLTensor::associate_memory_group(arm_compute::IMemoryGroup *memory_group)
 {
     _allocator.set_associated_memory_group(memory_group);
 }
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 2b5fbb8241..eaf46d42ca 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
@@ -41,10 +42,10 @@ namespace
  *
  * @return A wrapped memory region
  */
-std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(CLCoreRuntimeContext *ctx, size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
-    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
+    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(ctx,
                                                                                                  CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
                                                                                                  size,
                                                                                                  alignment);
@@ -52,12 +53,12 @@ std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, siz
     // Try coarse-grain SVM in case of failure
     if(region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(ctx, CL_MEM_READ_WRITE, size, alignment);
     }
     // Try legacy buffer memory in case of failure
     if(region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+        region = support::cpp14::make_unique<CLBufferMemoryRegion>(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
     }
     return region;
 }
@@ -103,8 +104,8 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
 }
 } // namespace
 
-CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner)
-    : _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _scale(), _offset()
+CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext *ctx)
+    : _ctx(ctx), _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _scale(), _offset() // ctx may be nullptr: lock()/unlock() then use the CLScheduler singleton
 {
 }
 
@@ -129,7 +130,15 @@ void CLTensorAllocator::allocate()
     if(_associated_memory_group == nullptr)
     {
         // Perform memory allocation
-        _memory.set_owned_region(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
+        if(_ctx == nullptr)
+        {
+            auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue());
+            _memory.set_owned_region(allocate_region(&legacy_ctx, info().total_size(), 0));
+        }
+        else
+        {
+            _memory.set_owned_region(allocate_region(_ctx->core_runtime_context(), info().total_size(), 0));
+        }
     }
     else
     {
@@ -162,9 +171,17 @@ Status CLTensorAllocator::import_memory(cl::Buffer buffer)
     ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
 
-    _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer));
-    info().set_is_resizable(false);
+    if(_ctx == nullptr)
+    {
+        auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue());
+        _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, &legacy_ctx));
+    }
+    else
+    {
+        _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, _ctx->core_runtime_context()));
+    }
 
+    info().set_is_resizable(false);
     return Status{};
 }
 
@@ -179,13 +196,28 @@ void CLTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_mem
 
 uint8_t *CLTensorAllocator::lock()
 {
-    return map(CLScheduler::get().queue(), true);
+    // Blocking map through the queue of the attached runtime context; the
+    // CLScheduler singleton is only used by the legacy, context-less API.
+    if(_ctx == nullptr)
+    {
+        return map(CLScheduler::get().queue(), true);
+    }
+    cl::CommandQueue &ctx_queue = _ctx->gpu_scheduler()->queue();
+    return map(ctx_queue, true);
 }
 
 void CLTensorAllocator::unlock()
 {
     ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    unmap(CLScheduler::get().queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
+    uint8_t *const mapped_ptr = reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+    if(_ctx == nullptr)
+    {
+        // Legacy singleton API
+        unmap(CLScheduler::get().queue(), mapped_ptr);
+        return;
+    }
+    // Context-aware path: the queue belongs to the context's scheduler
+    unmap(_ctx->gpu_scheduler()->queue(), mapped_ptr);
 }
 
 uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index a1a56fd06c..fb8eba8aa4 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,20 +24,21 @@
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
-ICLSimpleFunction::ICLSimpleFunction() // NOLINT
+ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT
     : _kernel(),
-      _border_handler()
+      _border_handler(),
+      _ctx(ctx) // handed to schedule_kernel_on_ctx() by run(); may be nullptr
 {
 }
 
 void ICLSimpleFunction::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or function isn't configured");
-
-    CLScheduler::get().enqueue(_border_handler, false);
-    CLScheduler::get().enqueue(*_kernel);
+    schedule_kernel_on_ctx(_ctx, &_border_handler, false); // border handler goes first and is not flushed
+    schedule_kernel_on_ctx(_ctx, _kernel.get());           // flush argument omitted, so the declaration's default applies
 }
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 2b66795cf9..00dbb71f4c 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -25,18 +25,21 @@
 
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
 {
-CLActivationLayer::CLActivationLayer(void *ctx)
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
+    : ICLSimpleFunction(ctx) // the base class stores ctx; configure() reads it back through _ctx
 {
-    ARM_COMPUTE_UNUSED(ctx);
 }
 
 void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
+    auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr;
+
+    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(core_ctx); // NOTE(review): with core_ctx == nullptr the kernel presumably uses the legacy singletons; confirm
     k->configure(input, output, act_info);
     _kernel = std::move(k);
 }