From 035004e65dbffb6534ad4183cf8f95da0544fd28 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 13 Apr 2021 19:44:17 +0100
Subject: Add support for a global allocator for OpenCL tensors

Give the user the ability to specify an allocator that is used for all
internal function tensors. Since this allocator is global, it needs to
outlive all the tensors/functions that use it.

Resolves: COMPMID-4212, COMPMID-4213

Signed-off-by: Georgios Pinitas
Change-Id: I251871c242879976819ebca1452404133a8e62d7
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5420
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 arm_compute/runtime/CL/CLTensorAllocator.h   |  10 ++-
 src/runtime/CL/CLTensorAllocator.cpp         |  18 ++++-
 tests/validation/CL/UNIT/TensorAllocator.cpp | 103 ++++++++++++++++++++++++++-
 3 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 067c391489..1b061ee1d6 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 
 #include "arm_compute/runtime/CL/CLArray.h"
 #include "arm_compute/runtime/CL/CLMemory.h"
+#include "arm_compute/runtime/IAllocator.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
@@ -127,6 +128,13 @@ public:
      */
     void set_associated_memory_group(IMemoryGroup *associated_memory_group);
 
+    /** Sets the global allocator that will be used by all CLTensor objects
+     *
+     * @note The allocator must outlive all tensors/functions that use it.
+     * @param[in] allocator Allocator to be used as a global allocator
+     */
+    static void set_global_allocator(IAllocator *allocator);
+
 protected:
     /** Call map() on the OpenCL buffer.
      *
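[Editor's note - not part of the patch: a minimal usage sketch of the new
API. It assumes a default-initialized scheduler and uses CLBufferAllocator
as the backing allocator; both are illustrative choices, not mandated by
the change.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLBufferAllocator.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/CLTensorAllocator.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // The allocator is global state: it must outlive every tensor and
        // function that allocates through it.
        CLBufferAllocator global_alloc{};
        CLTensorAllocator::set_global_allocator(&global_alloc);

        {
            CLTensor tensor;
            tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
            tensor.allocator()->allocate(); // backing memory comes from global_alloc
        } // tensor is released while global_alloc is still alive

        // Unset the global allocator before it goes out of scope.
        CLTensorAllocator::set_global_allocator(nullptr);
        return 0;
    }

Tensors that are associated with a memory group keep going through the
group (see the allocate() change below); the global allocator only takes
over the plain allocation path.]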
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index fc789fa4b9..c82e9dfc67 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,9 +31,11 @@
 namespace arm_compute
 {
 const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
-
 namespace
 {
+/** Global user-defined allocator that can be used for all internal allocations of a CLTensor */
+static IAllocator *static_global_cl_allocator = nullptr;
+
 /** Helper function used to allocate the backing memory of a tensor
  *
  * @param[in] context OpenCL context to use
@@ -130,7 +132,11 @@ void CLTensorAllocator::allocate()
     if(_associated_memory_group == nullptr)
     {
         // Perform memory allocation
-        if(_ctx == nullptr)
+        if(static_global_cl_allocator != nullptr)
+        {
+            _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0));
+        }
+        else if(_ctx == nullptr)
         {
             auto legacy_ctx = CLCoreRuntimeContext(nullptr, CLScheduler::get().context(), CLScheduler::get().queue());
             _memory.set_owned_region(allocate_region(&legacy_ctx, info().total_size(), 0));
@@ -142,6 +148,7 @@ void CLTensorAllocator::allocate()
     }
     else
     {
+        // Finalize memory management instead
         _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment());
     }
 
@@ -194,6 +201,11 @@ void CLTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_memory_group)
     _associated_memory_group = associated_memory_group;
 }
 
+void CLTensorAllocator::set_global_allocator(IAllocator *allocator)
+{
+    static_global_cl_allocator = allocator;
+}
+
 uint8_t *CLTensorAllocator::lock()
 {
     if(_ctx)
diff --git a/tests/validation/CL/UNIT/TensorAllocator.cpp b/tests/validation/CL/UNIT/TensorAllocator.cpp
index 3ccdd99fe3..4ebd521bf0 100644
--- a/tests/validation/CL/UNIT/TensorAllocator.cpp
+++ b/tests/validation/CL/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,9 +24,14 @@
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 
 #include "arm_compute/core/utils/misc/MMappedFile.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/Globals.h"
 #include "tests/framework/Asserts.h"
@@ -60,12 +65,108 @@ cl_mem import_malloc_memory_helper(void *ptr, size_t size)
 
     return buf;
 }
+
+class DummyAllocator final : public IAllocator
+{
+public:
+    DummyAllocator() = default;
+
+    void *allocate(size_t size, size_t alignment) override
+    {
+        ++_n_calls;
+        return _backend_allocator.allocate(size, alignment);
+    }
+    void free(void *ptr) override
+    {
+        return _backend_allocator.free(ptr);
+    }
+    std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override
+    {
+        // Needs to be implemented, as this is the method that CLTensorAllocator uses internally
+        ++_n_calls;
+        return _backend_allocator.make_region(size, alignment);
+    }
+    int get_n_calls() const
+    {
+        return _n_calls;
+    }
+
+private:
+    int               _n_calls{};
+    CLBufferAllocator _backend_allocator{};
+};
+
+void run_conv2d(std::shared_ptr<MemoryManagerOnDemand> mm, IAllocator &mm_allocator)
+{
+    // Create tensors
+    CLTensor src, weights, bias, dst;
+    src.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U, 2U), 1, DataType::F32, DataLayout::NHWC));
+    weights.allocator()->init(TensorInfo(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32, DataLayout::NHWC));
+    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32, DataLayout::NHWC));
+    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U, 2U), 1, DataType::F32, DataLayout::NHWC));
+
+    // Create and configure function
+    CLGEMMConvolutionLayer conv(mm);
+    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1U, 1U, 1U, 1U));
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    weights.allocator()->allocate();
+    bias.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    // Finalize memory manager
+    if(mm != nullptr)
+    {
+        mm->populate(mm_allocator, 1 /* num_pools */);
+        ARM_COMPUTE_EXPECT(mm->lifetime_manager()->are_all_finalized(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(mm->pool_manager()->num_pools() == 1, framework::LogLevel::ERRORS);
+    }
+
+    conv.run();
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(UNIT)
 TEST_SUITE(TensorAllocator)
 
+/* Validate that an external global allocator can be used for all internal allocations */
+TEST_CASE(ExternalGlobalAllocator, framework::DatasetMode::ALL)
+{
+    DummyAllocator global_tensor_alloc;
+    CLTensorAllocator::set_global_allocator(&global_tensor_alloc);
+
+    // Run a convolution
+    run_conv2d(nullptr /* mm */, global_tensor_alloc);
+
+    // Check that the allocator has been called multiple times (> 4)
+    ARM_COMPUTE_EXPECT(global_tensor_alloc.get_n_calls() > 4, framework::LogLevel::ERRORS);
+
+    // Nullify global allocator
+    CLTensorAllocator::set_global_allocator(nullptr);
+}
+
+/* Validate that an external global allocator can be used for the pool manager */
+TEST_CASE(ExternalGlobalAllocatorMemoryPool, framework::DatasetMode::ALL)
+{
+    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+    auto pool_mgr     = std::make_shared<PoolManager>();
+    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+    DummyAllocator global_tensor_alloc;
+    CLTensorAllocator::set_global_allocator(&global_tensor_alloc);
+
+    // Run a convolution
+    run_conv2d(mm, global_tensor_alloc);
+
+    // Check that the allocator has been called multiple times (> 4)
+    ARM_COMPUTE_EXPECT(global_tensor_alloc.get_n_calls() > 4, framework::LogLevel::ERRORS);
+
+    // Nullify global allocator
+    CLTensorAllocator::set_global_allocator(nullptr);
+}
+
 /** Validates import memory interface when importing cl buffer objects */
 TEST_CASE(ImportMemoryBuffer, framework::DatasetMode::ALL)
 {
-- 
cgit v1.2.1
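[Editor's note - not part of the patch: the DummyAllocator above is test
infrastructure, but the same pattern works for user-side instrumentation.
Below is a sketch of a tracking allocator that counts the bytes requested
through the new global-allocator hook; it assumes only the IAllocator
interface used in this patch, delegates the real work to CLBufferAllocator,
and the class name is hypothetical.

    #include "arm_compute/runtime/CL/CLBufferAllocator.h"
    #include "arm_compute/runtime/CL/CLTensorAllocator.h"
    #include "arm_compute/runtime/IAllocator.h"
    #include "arm_compute/runtime/IMemoryRegion.h"

    #include <atomic>
    #include <cstddef>
    #include <memory>

    class TrackingAllocator final : public arm_compute::IAllocator
    {
    public:
        void *allocate(size_t size, size_t alignment) override
        {
            _bytes += size;
            return _impl.allocate(size, alignment);
        }
        void free(void *ptr) override
        {
            _impl.free(ptr);
        }
        // make_region() is the entry point CLTensorAllocator calls internally
        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
        {
            _bytes += size;
            return _impl.make_region(size, alignment);
        }
        size_t allocated_bytes() const
        {
            return _bytes;
        }

    private:
        std::atomic<size_t>            _bytes{ 0 };
        arm_compute::CLBufferAllocator _impl{}; // delegate the actual OpenCL allocation
    };

Installed with CLTensorAllocator::set_global_allocator(&tracker), such an
allocator observes every internal CL tensor allocation, which is exactly
the behaviour the call-count checks in the two tests above rely on.]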