From db8485ac24135f17e9882c76196924435abc064f Mon Sep 17 00:00:00 2001
From: Pablo Tello <pablo.tello@arm.com>
Date: Tue, 24 Sep 2019 11:03:47 +0100
Subject: COMPMID-2205: CL runtime context.

CL Interfaces implemented.
Concrete classes implemented.
One test (ActivationLayer) ported to the new interface.

Change-Id: I283808bec36ccfc2f13fe048c45cbbee698ce525
Signed-off-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1998
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/CLCoreRuntimeContext.cpp            | 52 ++++++++++++++++++
 src/core/CL/CLHelpers.cpp                       | 16 ++++++
 src/core/CL/CLKernelLibrary.cpp                 | 65 +++++++++++++++++++++++
 src/core/CL/OpenCL.cpp                          |  5 +-
 src/core/CL/kernels/CLActivationLayerKernel.cpp |  8 +--
 src/graph/backends/CL/CLDeviceBackend.cpp       |  7 +--
 src/runtime/CL/CLBufferAllocator.cpp            | 27 +++++++---
 src/runtime/CL/CLHelpers.cpp                    | 16 ++++++
 src/runtime/CL/CLMemoryRegion.cpp               | 39 ++++++++------
 src/runtime/CL/CLRuntimeContext.cpp             | 67 +++++++++++++++++++++++
 src/runtime/CL/CLScheduler.cpp                  | 70 +++++++++++++++++++++++--
 src/runtime/CL/CLTensor.cpp                     | 16 ++++--
 src/runtime/CL/CLTensorAllocator.cpp            | 54 +++++++++++++++----
 src/runtime/CL/ICLSimpleFunction.cpp            | 13 ++---
 src/runtime/CL/functions/CLActivationLayer.cpp  |  9 ++--
 src/runtime/GLES_COMPUTE/GCTensor.cpp           |  4 +-
 src/runtime/Tensor.cpp                          |  4 +-
 17 files changed, 405 insertions(+), 67 deletions(-)
 create mode 100644 src/core/CL/CLCoreRuntimeContext.cpp
 create mode 100644 src/runtime/CL/CLRuntimeContext.cpp

(limited to 'src')

diff --git a/src/core/CL/CLCoreRuntimeContext.cpp b/src/core/CL/CLCoreRuntimeContext.cpp
new file mode 100644
index 0000000000..f9efad2c0d
--- /dev/null
+++ b/src/core/CL/CLCoreRuntimeContext.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
+
+namespace arm_compute
+{
+CLCoreRuntimeContext::CLCoreRuntimeContext() // Empty context: no kernel library, default-constructed CL handles
+    : _kernel_lib(nullptr), _ctx(), _queue()
+{
+}
+
+CLCoreRuntimeContext::CLCoreRuntimeContext(CLKernelLibrary *kernel_lib, cl::Context ctx, cl::CommandQueue queue)
+    : _kernel_lib(kernel_lib), _ctx(ctx), _queue(queue) // kernel_lib is kept as a raw pointer; the CL handles are copied
+{
+}
+
+CLKernelLibrary *CLCoreRuntimeContext::kernel_library() const
+{
+    return _kernel_lib; // May be nullptr (default-constructed context, or built with a null library)
+}
+
+cl::Context CLCoreRuntimeContext::context()
+{
+    return _ctx; // Returned by value: the caller receives its own handle object
+}
+
+cl::CommandQueue CLCoreRuntimeContext::queue()
+{
+    return _queue; // Returned by value, like context()
+}
+} // namespace arm_compute
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index d051810090..a3c73677c7 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -22,6 +22,8 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Log.h"
@@ -283,4 +285,18 @@ bool preferred_dummy_work_items_support(const cl::Device &device)
     // TODO (COMPMID-2044)
     return true;
 }
+
+cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions &build_opts)
+{
+    // Pick the kernel library that will build the kernel:
+    //  - new api: the library attached to the core runtime context, when both the
+    //    context and its library are present;
+    //  - legacy api: the CLKernelLibrary singleton otherwise.
+    const bool use_ctx_library = (ctx != nullptr) && (ctx->kernel_library() != nullptr);
+
+    CLKernelLibrary &kernel_library = use_ctx_library ? *ctx->kernel_library() : CLKernelLibrary::get();
+
+    // Both paths build with the same kernel name and build options.
+    return static_cast<cl::Kernel>(kernel_library.create_kernel(kernel_name, build_opts.options()));
+}
 } // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 7b7263fca7..c27f886129 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1144,6 +1144,49 @@ Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const Stri
     return Kernel(kernel_name, cl_program);
 }
 
+void CLKernelLibrary::init(std::string kernel_path, cl::Context context, cl::Device device)
+{
+    _kernel_path = std::move(kernel_path); // By-value arguments are moved into the members, avoiding a second copy
+    _context     = std::move(context);
+    _device      = std::move(device);
+}
+
+void CLKernelLibrary::set_kernel_path(const std::string &kernel_path)
+{
+    _kernel_path = kernel_path; // Only the stored path changes; the context and device are left untouched
+}
+
+cl::Context &CLKernelLibrary::context()
+{
+    return _context; // Mutable reference to the member, not a copy
+}
+
+cl::Device &CLKernelLibrary::get_device()
+{
+    return _device; // Mutable reference to the member, not a copy
+}
+
+void CLKernelLibrary::set_device(cl::Device device)
+{
+    _device = std::move(device); // Replaces the device only; _context is not updated here
+}
+
+std::string CLKernelLibrary::get_kernel_path()
+{
+    return _kernel_path; // Returned by value, so the caller receives a copy
+}
+
+void CLKernelLibrary::clear_programs_cache()
+{
+    _programs_map.clear(); // Both program maps are emptied together
+    _built_programs_map.clear();
+}
+
+const std::map<std::string, cl::Program> &CLKernelLibrary::get_built_programs() const
+{
+    return _built_programs_map; // Read-only reference to the internal map; no copy is made
+}
+
 void CLKernelLibrary::add_built_program(const std::string &built_program_name, const cl::Program &program)
 {
     _built_programs_map.emplace(built_program_name, program);
@@ -1205,6 +1248,28 @@ const Program &CLKernelLibrary::load_program(const std::string &program_name) co
     return new_program.first->second;
 }
 
+void CLKernelLibrary::set_context(cl::Context context)
+{
+    _context = std::move(context);
+
+    // Device to pair with the new context. It stays default-constructed when the
+    // context is null or reports no devices.
+    cl::Device selected_device;
+
+    if(_context.get() != nullptr)
+    {
+        const auto context_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+        // Only the first device listed by the context is used.
+        if(!context_devices.empty())
+        {
+            selected_device = context_devices[0];
+        }
+    }
+
+    _device = selected_device;
+}
+
 std::string CLKernelLibrary::stringify_set(const StringSet &s) const
 {
     std::string concat_set;
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 1ce1b526d7..74c5b041d7 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #pragma GCC diagnostic pop
 
+#include "arm_compute/core/Error.h"
+
 #include <dlfcn.h>
 #include <iostream>
 
@@ -54,6 +56,7 @@ bool CLSymbols::load_default()
     {
         if(load(lib))
         {
+            ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library");
             return true;
         }
     }
@@ -948,4 +951,4 @@ clImportMemoryARM(cl_context                      context,
         }
         return nullptr;
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 97a0ff6c6c..5062fd1801 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
@@ -111,8 +111,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 }
 } // namespace
 
-CLActivationLayerKernel::CLActivationLayerKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLActivationLayerKernel::CLActivationLayerKernel(CLCoreRuntimeContext *ctx)
+    : _input(nullptr), _output(nullptr), _run_in_place(false), _ctx(ctx) // ctx is only stored; nullptr selects the legacy CLKernelLibrary singleton when the kernel is created
 {
 }
 
@@ -205,8 +205,8 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
     {
         kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
     }
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
+    _kernel = create_opencl_kernel(_ctx, kernel_name, build_opts);
     // Make sure _kernel is initialized before calling the parent's configure
     _input  = input;
     _output = output;
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index ea3b6b801a..58c666c3cc 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -34,6 +34,7 @@
 #include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
 #include "arm_compute/graph/backends/CL/CLTensorHandle.h"
 
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/BlobLifetimeManager.h"
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"
@@ -64,7 +65,7 @@ bool file_exists(const std::string &filename)
 static detail::BackendRegistrar<CLDeviceBackend> CLDeviceBackend_registrar(Target::CL);
 
 CLDeviceBackend::CLDeviceBackend()
-    : _context_count(0), _tuner(), _allocator(nullptr), _tuner_file()
+    : _context_count(0), _tuner(), _allocator(nullptr), _tuner_file(), _legacy_ctx() // _legacy_ctx starts empty and is created in initialize_backend()
 {
 }
 
@@ -91,9 +92,9 @@ void CLDeviceBackend::initialize_backend()
 {
     // Setup Scheduler
     CLScheduler::get().default_init(&_tuner);
-
+    _legacy_ctx = support::cpp14::make_unique<CLCoreRuntimeContext>(nullptr, CLScheduler::get().context(), CLScheduler::get().queue());
     // Create allocator with new context
-    _allocator = support::cpp14::make_unique<CLBufferAllocator>();
+    _allocator = support::cpp14::make_unique<CLBufferAllocator>(_legacy_ctx.get());
 }
 
 void CLDeviceBackend::release_backend_context(GraphContext &ctx)
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index 84789e70d2..ed27320650 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,25 +22,35 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"
-#include "arm_compute/runtime/CL/CLMemoryRegion.h"
 
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
-using namespace arm_compute;
-
-CLBufferAllocator::CLBufferAllocator(cl::Context context)
-    : _context(std::move(context))
+namespace arm_compute
+{
+CLBufferAllocator::CLBufferAllocator(CLCoreRuntimeContext *ctx)
+    : _ctx(ctx) // Raw pointer is stored as-is; nullptr makes allocate() use the CLScheduler singleton's context
 {
 }
 
 void *CLBufferAllocator::allocate(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    cl_mem buf = clCreateBuffer(_context.get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr);
+    // Resolve the OpenCL context that will own the buffer:
+    //  - legacy path (no core context supplied): the CLScheduler singleton's context;
+    //  - otherwise: the context carried by the injected core runtime context.
+    cl::Context target_context = (_ctx == nullptr) ? CLScheduler::get().context() : _ctx->context();
+
+    // The creation error code is not requested (last argument is nullptr), so a failed
+    // allocation reaches the caller only as a null handle.
+    const cl_mem_flags flags = CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE;
+    cl_mem             buf   = clCreateBuffer(target_context.get(), flags, size, nullptr, nullptr);
     return static_cast<void *>(buf);
 }
 
@@ -53,5 +63,6 @@ void CLBufferAllocator::free(void *ptr)
 std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size); // A null _ctx is passed through: the region then falls back to the CLScheduler singleton
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index edfc8ed2aa..c4c7ee2107 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 
 namespace
 {
@@ -103,4 +104,19 @@ create_opencl_context_and_device()
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
     return std::make_tuple(cl_context, device, err);
 }
+
+void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
+    if(ctx == nullptr)
+    {
+        // Legacy path: no runtime context, so enqueue on the CLScheduler singleton.
+        CLScheduler::get().enqueue(*kernel, flush);
+        return;
+    }
+    // New api: enqueue on the scheduler attached to the runtime context.
+    ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
+    ctx->gpu_scheduler()->enqueue(*kernel, flush);
+}
+
 } // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 2976903c93..52906a893f 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -23,13 +23,18 @@
  */
 #include "arm_compute/runtime/CL/CLMemoryRegion.h"
 
+#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-ICLMemoryRegion::ICLMemoryRegion(cl::Context ctx, size_t size)
-    : IMemoryRegion(size), _ctx(std::move(ctx)), _mapping(nullptr), _mem()
+ICLMemoryRegion::ICLMemoryRegion(CLCoreRuntimeContext *ctx, size_t size)
+    : IMemoryRegion(size),
+      _queue((ctx != nullptr) ? ctx->queue() : CLScheduler::get().queue()), // Queue handle is copied here; the ctx pointer itself is not stored by this initialiser list
+      _ctx((ctx != nullptr) ? ctx->context() : CLScheduler::get().context()), // Falls back to the CLScheduler singleton when ctx is null
+      _mapping(nullptr),
+      _mem()
 {
 }
 
@@ -54,17 +59,17 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
     return nullptr;
 }
 
-CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
-    : ICLMemoryRegion(std::move(ctx), size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size)
+    : ICLMemoryRegion(ctx, size) // The base class resolves the context (ctx, or the CLScheduler singleton when ctx is null) into _ctx, which the body reuses
 {
     if(_size != 0)
     {
         _mem = cl::Buffer(_ctx, flags, _size);
     }
 }
 
-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
-    : ICLMemoryRegion(buffer.getInfo<CL_MEM_CONTEXT>(), buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer, CLCoreRuntimeContext *ctx)
+    : ICLMemoryRegion(ctx, buffer.getInfo<CL_MEM_SIZE>()) // NOTE(review): the region's context now comes from ctx (or the CLScheduler singleton), no longer from the buffer's CL_MEM_CONTEXT - confirm callers guarantee they match
 {
     _mem = buffer;
 }
@@ -88,15 +93,15 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }
 
-ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLMemoryRegion(std::move(ctx), size), _ptr(nullptr)
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLMemoryRegion(ctx, size), _ptr(nullptr) // The base class resolves ctx into _ctx; the body allocates with _ctx so clSVMAlloc and the destructor's clSVMFree use the same context handle
 {
     if(size != 0)
     {
         _ptr = clSVMAlloc(_ctx.get(), flags, size, alignment);
         if(_ptr != nullptr)
         {
             _mem = cl::Buffer(_ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
         }
     }
 }
@@ -107,7 +112,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
     {
         try
         {
-            clFinish(CLScheduler::get().queue().get());
+            clFinish(_queue.get());
             _mem = cl::Buffer();
             clSVMFree(_ctx.get(), _ptr);
         }
@@ -122,8 +127,8 @@ void *ICLSVMMemoryRegion::ptr()
     return _ptr;
 }
 
-CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(ctx, flags, size, alignment) // All arguments are forwarded unchanged to the ICLSVMMemoryRegion constructor
 {
 }
 
@@ -142,8 +147,8 @@ void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
     _mapping = nullptr;
 }
 
-CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
-    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(ctx, flags, size, alignment) // All arguments are forwarded unchanged to the ICLSVMMemoryRegion constructor
 {
 }
 
@@ -162,4 +167,4 @@ void CLFineSVMMemoryRegion::unmap(cl::CommandQueue &q)
     ARM_COMPUTE_UNUSED(q);
     _mapping = nullptr;
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
new file mode 100644
index 0000000000..49e4c10c84
--- /dev/null
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLRuntimeContext::CLRuntimeContext()
+    : _gpu_owned_scheduler(support::cpp14::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _core_context()
+{
+    _symbols.load_default(); // NOTE(review): the boolean result is ignored here - confirm a failed symbol load is caught elsewhere
+    auto ctx_dev_err = create_opencl_context_and_device();
+    ARM_COMPUTE_ERROR_ON_MSG(std::get<2>(ctx_dev_err) != CL_SUCCESS, "Failed to create OpenCL context");
+    auto             ctx   = std::get<0>(ctx_dev_err);
+    auto             dev   = std::get<1>(ctx_dev_err);
+    cl::CommandQueue queue = cl::CommandQueue(ctx, dev);
+    _gpu_owned_scheduler->init(ctx, queue, dev, &_tuner);
+    const std::string cl_kernels_folder("./cl_kernels/"); // Trailing '/' matches the folder CLScheduler::default_init_with_context() passes to the kernel library
+    _kernel_lib.init(cl_kernels_folder, ctx, dev);
+    _core_context = CLCoreRuntimeContext(&_kernel_lib, _gpu_owned_scheduler->context(), _gpu_owned_scheduler->queue());
+}
+
+CLKernelLibrary &CLRuntimeContext::kernel_library()
+{
+    return _kernel_lib; // Reference to the kernel library member initialised in the constructor
+}
+
+CLCoreRuntimeContext *CLRuntimeContext::core_runtime_context()
+{
+    return &_core_context; // Points at a member: valid only while this CLRuntimeContext is alive
+}
+
+void CLRuntimeContext::set_gpu_scheduler(CLScheduler *scheduler)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler);
+    _gpu_scheduler = scheduler; // Non-owning; _core_context keeps the handles captured in the constructor and is not refreshed here
+}
+
+CLScheduler *CLRuntimeContext::gpu_scheduler()
+{
+    return _gpu_scheduler; // Either the owned scheduler created in the constructor or the one installed via set_gpu_scheduler()
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 701ffe0ab1..e78eaa482f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -23,13 +23,71 @@
  */
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "arm_compute/runtime/CL/CLHelpers.h"
-
+#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
 #include "arm_compute/runtime/CL/tuners/Tuners.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+cl::Context &CLScheduler::context()
+{
+    ARM_COMPUTE_ERROR_ON(!_is_initialised);
+    _context = CLKernelLibrary::get().context(); // NOTE(review): re-reads the context from the CLKernelLibrary singleton on every call - confirm this is intended for schedulers owned by a CLRuntimeContext
+    return _context;
+}
+
+cl::CommandQueue &CLScheduler::queue()
+{
+    ARM_COMPUTE_ERROR_ON(!_is_initialised);
+    return _queue; // Mutable reference to the scheduler's queue member
+}
+
+GPUTarget CLScheduler::target() const
+{
+    return _target;
+}
+
+void CLScheduler::set_queue(cl::CommandQueue queue)
+{
+    _queue = std::move(queue); // By-value argument is moved into the member
+}
+
+void CLScheduler::set_target(GPUTarget target)
+{
+    _target = target;
+}
+
+void CLScheduler::set_tuner(ICLTuner *tuner)
+{
+    _cl_tuner = tuner; // Raw pointer is stored as-is; nullptr makes tune_kernel_static() a no-op
+}
+
+void CLScheduler::sync()
+{
+    _queue.finish(); // Blocks until all commands queued on _queue have completed
+}
+
+cl::Event CLScheduler::enqueue_sync_event()
+{
+    cl::Event event;
+    _queue.enqueueMarker(&event); // The marker's event is written to 'event', which is returned to the caller
+    return event;
+}
+
+void CLScheduler::tune_kernel_static(ICLKernel &kernel)
+{
+    if(_cl_tuner != nullptr) // Without a tuner the call does nothing
+    {
+        _cl_tuner->tune_kernel_static(kernel);
+    }
+}
+
+bool CLScheduler::is_initialised() const
+{
+    return _is_initialised;
+}
 
 std::once_flag CLScheduler::_initialize_symbols;
 
@@ -49,8 +107,9 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx
 {
     if(!_is_initialised)
     {
-        cl::CommandQueue queue = cl::CommandQueue(ctx, device);
-        CLKernelLibrary::get().init("./cl_kernels/", ctx, device);
+        const std::string cl_kernels_folder("./cl_kernels/");
+        cl::CommandQueue  queue = cl::CommandQueue(ctx, device);
+        CLKernelLibrary::get().init(cl_kernels_folder, ctx, device);
         init(ctx, queue, device, cl_tuner);
         _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target);
         _cl_tuner                = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner;
@@ -113,3 +172,4 @@ void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
         _queue.flush();
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index 9bbf926b58..a6d0cf77ca 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -23,15 +23,21 @@
  */
 #include "arm_compute/runtime/CL/CLTensor.h"
 
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
 {
-CLTensor::CLTensor()
-    : _allocator(this)
+CLTensor::CLTensor(IRuntimeContext *ctx)
+    : _allocator(this, static_cast<CLRuntimeContext *>(ctx)), _ctx(static_cast<CLRuntimeContext *>(ctx)) // Unchecked downcast: ctx must point to a CLRuntimeContext or be nullptr
 {
 }
 
+CLRuntimeContext *CLTensor::context()
+{
+    return _ctx; // nullptr when the tensor was created without a runtime context (legacy CLScheduler singleton path)
+}
+
 TensorInfo *CLTensor::info() const
 {
     return &_allocator.info();
@@ -59,12 +65,12 @@ CLTensorAllocator *CLTensor::allocator()
 
 void CLTensor::map(bool blocking)
 {
-    ICLTensor::map(CLScheduler::get().queue(), blocking);
+    ICLTensor::map(_ctx == nullptr ? CLScheduler::get().queue() : _ctx->gpu_scheduler()->queue(), blocking); // Legacy singleton queue when no runtime context is set, otherwise the context scheduler's queue
 }
 
 void CLTensor::unmap()
 {
-    ICLTensor::unmap(CLScheduler::get().queue());
+    ICLTensor::unmap(_ctx == nullptr ? CLScheduler::get().queue() : _ctx->gpu_scheduler()->queue()); // Same queue selection as map()
 }
 
 uint8_t *CLTensor::do_map(cl::CommandQueue &q, bool blocking)
@@ -81,4 +87,4 @@ void CLTensor::associate_memory_group(arm_compute::IMemoryGroup *memory_group)
 {
     _allocator.set_associated_memory_group(memory_group);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 2b5fbb8241..eaf46d42ca 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
@@ -41,10 +42,10 @@ namespace
  *
  * @return A wrapped memory region
  */
-std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(CLCoreRuntimeContext *ctx, size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
-    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
+    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(ctx,
                                                                                                  CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
                                                                                                  size,
                                                                                                  alignment);
@@ -52,12 +53,12 @@ std::unique_ptr<ICLMemoryRegion> allocate_region(const cl::Context &context, siz
     // Try coarse-grain SVM in case of failure
     if(region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(ctx, CL_MEM_READ_WRITE, size, alignment);
     }
     // Try legacy buffer memory in case of failure
     if(region != nullptr && region->ptr() == nullptr)
     {
-        region = support::cpp14::make_unique<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+        region = support::cpp14::make_unique<CLBufferMemoryRegion>(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
     }
     return region;
 }
@@ -103,8 +104,8 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
 }
 } // namespace
 
-CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner)
-    : _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _scale(), _offset()
+CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext *ctx)
+    : _ctx(ctx), _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _scale(), _offset() // ctx may be nullptr: the allocator then uses the CLScheduler singleton
 {
 }
 
@@ -129,7 +130,15 @@ void CLTensorAllocator::allocate()
     if(_associated_memory_group == nullptr)
     {
         // Perform memory allocation
-        _memory.set_owned_region(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
+        if(_ctx == nullptr)
+        {
+            auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue());
+            _memory.set_owned_region(allocate_region(&legacy_ctx, info().total_size(), 0));
+        }
+        else
+        {
+            _memory.set_owned_region(allocate_region(_ctx->core_runtime_context(), info().total_size(), 0));
+        }
     }
     else
     {
@@ -162,9 +171,17 @@ Status CLTensorAllocator::import_memory(cl::Buffer buffer)
     ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
 
-    _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer));
-    info().set_is_resizable(false);
+    if(_ctx == nullptr)
+    {
+        auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue());
+        _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, &legacy_ctx));
+    }
+    else
+    {
+        _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer, _ctx->core_runtime_context()));
+    }
 
+    info().set_is_resizable(false);
     return Status{};
 }
 
@@ -179,13 +196,28 @@ void CLTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_mem
 
 uint8_t *CLTensorAllocator::lock()
 {
-    return map(CLScheduler::get().queue(), true);
+    // Choose the command queue used for the map:
+    //  - runtime context supplied: the queue of the scheduler attached to it;
+    //  - legacy singleton api: the queue of CLScheduler::get().
+    cl::CommandQueue &queue = (_ctx != nullptr) ? _ctx->gpu_scheduler()->queue() : CLScheduler::get().queue();
+
+    // lock() always requests a blocking map.
+    const bool blocking = true;
+    return map(queue, blocking);
 }
 
 void CLTensorAllocator::unlock()
 {
     ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    unmap(CLScheduler::get().queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
+    // Choose the command queue used for the unmap:
+    //  - runtime context supplied: the queue of the scheduler attached to it;
+    //  - legacy singleton api: the queue of CLScheduler::get().
+    cl::CommandQueue &queue = (_ctx != nullptr) ? _ctx->gpu_scheduler()->queue() : CLScheduler::get().queue();
+
+    // The pointer handed to unmap() is the region's buffer pointer.
+    uint8_t *mapped_ptr = reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+
+    unmap(queue, mapped_ptr);
 }
 
 uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index a1a56fd06c..fb8eba8aa4 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,20 +24,21 @@
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
-ICLSimpleFunction::ICLSimpleFunction() // NOLINT
+ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT
     : _kernel(),
-      _border_handler()
+      _border_handler(),
+      _ctx(ctx) // Raw pointer is stored as-is; nullptr makes run() schedule on the CLScheduler singleton
 {
 }
 
 void ICLSimpleFunction::run()
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or function isn't configured");
-
-    CLScheduler::get().enqueue(_border_handler, false);
-    CLScheduler::get().enqueue(*_kernel);
+    schedule_kernel_on_ctx(_ctx, &_border_handler, false); // Border handler goes first and is enqueued without a flush
+    schedule_kernel_on_ctx(_ctx, _kernel.get()); // Main kernel; the flush argument is left at its declared default
 }
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 2b66795cf9..00dbb71f4c 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -25,18 +25,21 @@
 
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
 {
-CLActivationLayer::CLActivationLayer(void *ctx)
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
+    : ICLSimpleFunction(ctx) // The context is handed to the base class, which stores it; it is no longer discarded
 {
-    ARM_COMPUTE_UNUSED(ctx);
 }
 
 void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
+    auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr;
+    // A null core context makes the kernel fall back to the legacy singleton path.
+    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>(core_ctx);
     k->configure(input, output, act_info);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
index 66c1abdb6d..e05eb4c4ae 100644
--- a/src/runtime/GLES_COMPUTE/GCTensor.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp
@@ -26,7 +26,7 @@
 
 namespace arm_compute
 {
-GCTensor::GCTensor()
+GCTensor::GCTensor(IRuntimeContext *) // The context parameter is unnamed and not used by this constructor
     : _allocator(this)
 {
 }
@@ -80,4 +80,4 @@ void GCTensor::do_unmap()
 {
     _allocator.unmap();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index de08efd731..8f7ecd6ffa 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -25,7 +25,7 @@
 
 namespace arm_compute
 {
-Tensor::Tensor()
+Tensor::Tensor(IRuntimeContext *) // The context parameter is unnamed and not used by this constructor
     : _allocator(this)
 {
 }
@@ -54,4 +54,4 @@ void Tensor::associate_memory_group(IMemoryGroup *memory_group)
 {
     _allocator.set_associated_memory_group(memory_group);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
-- 
cgit v1.2.1