From c1c872f12797ef6fe52c4589113e7efc353e56eb Mon Sep 17 00:00:00 2001 From: Jan Eilers Date: Thu, 22 Jul 2021 13:17:04 +0100 Subject: Adds CustomAllocator interface and Sample App * Updates the runtime options with a CustomAllocatorMap which allows to define a CustomAllocator for specific backends * Change IBackendInternal interface to use a shared pointer to a custom allocator * Update ClBackend.hpp/cpp to use the CustomAllocator * Adds an example application and unit test which uses a CustomAllocator for GpuAcc * Refactor of the interface to use MemorySource instead of the user Mapping cl_mem directly * Modify the BackendRegistry to also hold a registry of CustomAllocators * BackendRegistry Deregister will also deregister any allocators associated with that backend id * set_global_allocator within the BaseMemoryManager so that it always matches the currently used allocator Signed-off-by: Jan Eilers Change-Id: I156d819686021865f4375e6cb7a5c5dec8fee9e8 Signed-off-by: David Monahan --- include/armnn/BackendRegistry.hpp | 5 + include/armnn/IRuntime.hpp | 20 ++- include/armnn/backends/IBackendInternal.hpp | 5 +- include/armnn/backends/ICustomAllocator.hpp | 18 ++- samples/CMakeLists.txt | 5 + samples/CustomMemoryAllocatorSample.cpp | 175 +++++++++++++++++++++ src/armnn/BackendRegistry.cpp | 21 +++ src/armnn/Runtime.cpp | 64 +++++++- src/armnn/test/OptimizerTests.cpp | 133 +--------------- src/backends/aclCommon/BaseMemoryManager.cpp | 2 +- src/backends/aclCommon/BaseMemoryManager.hpp | 8 +- .../backendsCommon/test/CompatibilityTests.cpp | 4 + src/backends/cl/ClBackend.cpp | 62 ++++++-- src/backends/cl/ClBackend.hpp | 128 ++++++++++++++- src/backends/cl/ClImportTensorHandle.hpp | 10 +- src/backends/cl/ClRegistryInitializer.cpp | 8 + src/backends/cl/test/CMakeLists.txt | 1 + src/backends/cl/test/ClCustomAllocatorTests.cpp | 160 +++++++++++++++++++ src/backends/cl/test/ClImportTensorHandleTests.cpp | 2 +- 19 files changed, 658 insertions(+), 173 deletions(-) create mode 
100644 samples/CustomMemoryAllocatorSample.cpp create mode 100644 src/backends/cl/test/ClCustomAllocatorTests.cpp diff --git a/include/armnn/BackendRegistry.hpp b/include/armnn/BackendRegistry.hpp index fe6451cde0..c13aa9f8b6 100644 --- a/include/armnn/BackendRegistry.hpp +++ b/include/armnn/BackendRegistry.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,8 @@ public: BackendIdSet GetBackendIds() const; std::string GetBackendIdsAsString() const; void SetProfilingService(armnn::Optional profilingService); + void RegisterAllocator(const BackendId& id, std::shared_ptr alloc); + std::unordered_map> GetAllocators(); BackendRegistry() {} virtual ~BackendRegistry() {} @@ -50,6 +53,7 @@ public: }; void Deregister(const BackendId& id); + void DeregisterAllocator(const BackendId &id); protected: using FactoryStorage = std::unordered_map; @@ -63,6 +67,7 @@ private: FactoryStorage m_Factories; armnn::Optional m_ProfilingService; + std::unordered_map> m_CustomMemoryAllocatorMap; }; BackendRegistry& BackendRegistryInstance(); diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index 8c269dee49..97a9c2889e 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -16,6 +16,7 @@ #include #include +#include namespace armnn { @@ -103,8 +104,8 @@ public: : m_GpuAccTunedParameters(nullptr) , m_EnableGpuProfiling(false) , m_DynamicBackendsPath("") - , m_CustomAllocator(nullptr) , m_ProtectedMode(false) + , m_CustomAllocatorMap() {} /// If set, uses the GpuAcc tuned parameters from the given object when executing GPU workloads. @@ -118,17 +119,22 @@ public: /// Only a single path is allowed for the override std::string m_DynamicBackendsPath; - /// A Custom Allocator used for allocation of working memory in the backends. 
- /// Set this for when you need to allocate Protected Working Memory, required for ProtectedMode - /// Only supported for GpuAcc - ICustomAllocator* m_CustomAllocator; - /// Setting this flag will allow the user to create the Runtime in protected mode. /// It will run all the inferences on protected memory and will make sure that /// INetworkProperties::m_ImportEnabled set to true with MemorySource::DmaBufProtected option - /// This will use Protected Memory Allocator associated with the backend + /// This requires that the backend supports Protected Memory and has an allocator capable of + /// allocating Protected Memory associated with it. bool m_ProtectedMode; + /// @brief A map to define a custom memory allocator for specific backend Ids. + /// + /// @details A Custom Allocator is used for allocation of working memory in the backends. + /// Set this if you need to take control of how memory is allocated on a backend. Required for + /// Protected Mode in order to correctly allocate Protected Memory + /// + /// @note Only supported for GpuAcc + std::map> m_CustomAllocatorMap; + struct ExternalProfilingOptions { ExternalProfilingOptions() diff --git a/include/armnn/backends/IBackendInternal.hpp b/include/armnn/backends/IBackendInternal.hpp index 3b4ef95703..626746465f 100644 --- a/include/armnn/backends/IBackendInternal.hpp +++ b/include/armnn/backends/IBackendInternal.hpp @@ -199,10 +199,13 @@ public: /// Signals the backend to use a custom memory allocator provided by the user /// + /// \param allocator - a pointer to the provided ICustomAllocator to use with this backend /// \param errMsg - Optional string variable to return error messages /// \return - Returns true if switching to custom allocator was successful - virtual bool UseCustomMemoryAllocator(armnn::Optional errMsg) + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) { + IgnoreUnused(allocator); if (errMsg) { std::stringstream message; diff --git 
a/include/armnn/backends/ICustomAllocator.hpp b/include/armnn/backends/ICustomAllocator.hpp index 1d4df0cb86..92cbcc2641 100644 --- a/include/armnn/backends/ICustomAllocator.hpp +++ b/include/armnn/backends/ICustomAllocator.hpp @@ -7,6 +7,7 @@ #include #include +#include namespace armnn { @@ -23,13 +24,20 @@ public: * @param[in] alignment Alignment that the returned pointer should comply with * * @return A pointer to the allocated memory + * The returned pointer must be host write accessible */ - virtual void *allocate(size_t size, size_t alignment) = 0; - /** Interface to be implemented by the child class to free the allocated tensor */ - virtual void free(void *ptr) = 0; + virtual void* allocate(size_t size, size_t alignment) = 0; - // Utility Function to define the Custom Memory Allocators capabilities - virtual bool SupportsProtectedMemory() = 0; + /** Interface to be implemented by the child class to free the allocated bytes */ + virtual void free(void* ptr) = 0; + + // Used to specify what type of memory is being allocated by this allocator. 
+ // Supported types are: + // MemorySource::Malloc + // Unsupported types are: + // MemorySource::DmaBuf + // MemorySource::DmaBufProtected + virtual armnn::MemorySource GetMemorySourceType() = 0; }; } // namespace armnn \ No newline at end of file diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index ff45eecbe0..7be6a69369 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -8,3 +8,8 @@ if(BUILD_SAMPLE_APP AND SAMPLE_DYNAMIC_BACKEND) target_link_libraries(DynamicSample armnn ${CMAKE_THREAD_LIBS_INIT}) endif() +if(BUILD_SAMPLE_APP AND ARMCOMPUTECL) + add_executable(CustomMemoryAllocatorSample CustomMemoryAllocatorSample.cpp) + target_link_libraries(CustomMemoryAllocatorSample armnn ${CMAKE_THREAD_LIBS_INIT}) +endif() + diff --git a/samples/CustomMemoryAllocatorSample.cpp b/samples/CustomMemoryAllocatorSample.cpp new file mode 100644 index 0000000000..51b3c81079 --- /dev/null +++ b/samples/CustomMemoryAllocatorSample.cpp @@ -0,0 +1,175 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include +#include + +#include + +/** Sample implementation of ICustomAllocator for use with the ClBackend. + * Note: any memory allocated must be host addressable with write access + * in order for ArmNN to be able to properly use it. 
*/ +class SampleClBackendCustomAllocator : public armnn::ICustomAllocator +{ +public: + SampleClBackendCustomAllocator() = default; + + void* allocate(size_t size, size_t alignment) + { + // If alignment is 0 just use the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment + if (alignment == 0) + { + alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo(); + } + size_t space = size + alignment + alignment; + auto allocatedMemPtr = std::malloc(space * sizeof(size_t)); + + if (std::align(alignment, size, allocatedMemPtr, space) == nullptr) + { + throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed"); + } + return allocatedMemPtr; + } + + void free(void* ptr) + { + std::free(ptr); + } + + armnn::MemorySource GetMemorySourceType() + { + return armnn::MemorySource::Malloc; + } +}; + + +// A simple example application to show the usage of a custom memory allocator. In this sample, the users single +// input number is multiplied by 1.0f using a fully connected layer with a single neuron to produce an output +// number that is the same as the input. All memory required to execute this mini network is allocated with +// the provided custom allocator. +// +// Using a Custom Allocator is required for use with Protected Mode and Protected Memory. +// This example is provided using only unprotected malloc as Protected Memory is platform +// and implementation specific. +// +// Note: This example is similar to the SimpleSample application that can also be found in armnn/samples. +// The differences are in the use of a custom allocator, the backend is GpuAcc, and the inputs/outputs +// are being imported instead of copied. (Import must be enabled when using a Custom Allocator) +// You might find this useful for comparison. 
+int main() +{ + using namespace armnn; + + float number; + std::cout << "Please enter a number: " << std::endl; + std::cin >> number; + + // Turn on logging to standard output + // This is useful in this sample so that users can learn more about what is going on + armnn::ConfigureLogging(true, false, LogSeverity::Info); + + // Construct ArmNN network + armnn::NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + armnn::FullyConnectedDescriptor fullyConnectedDesc; + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + weightsInfo.SetConstant(true); + armnn::ConstTensor weights(weightsInfo, weightsData); + ARMNN_NO_DEPRECATE_WARN_BEGIN + IConnectableLayer *fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, + weights, + EmptyOptional(), + "fully connected"); + ARMNN_NO_DEPRECATE_WARN_END + IConnectableLayer *InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer *OutputLayer = myNetwork->AddOutputLayer(0); + InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0)); + fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + // Create ArmNN runtime: + // + // This is the interesting bit when executing a model with a custom allocator. + // You can have different allocators for different backends. To support this + // the runtime creation option has a map that takes a BackendId and the corresponding + // allocator that should be used for that backend. 
+ // Only GpuAcc supports a Custom Allocator for now + // + // Note: This is not covered in this example but if you want to run a model on + // protected memory a custom allocator needs to be provided that supports + // protected memory allocations and the MemorySource of that allocator is + // set to MemorySource::DmaBufProtected + IRuntime::CreationOptions options; + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}}; + IRuntimePtr runtime = IRuntime::Create(options); + + //Set the tensors in the network. + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // Optimise ArmNN network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = true; + armnn::IOptimizedNetworkPtr optNet = + Optimize(*myNetwork, {"GpuAcc"}, runtime->GetDeviceSpec(), optOptions); + if (!optNet) + { + // This shouldn't happen for this simple sample, with GpuAcc backend. + // But in general usage Optimize could fail if the backend at runtime cannot + // support the model that has been provided. + std::cerr << "Error: Failed to optimise the input network." 
<< std::endl; + return 1; + } + + // Load graph into runtime + std::string ignoredErrorMessage; + INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); + runtime->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Creates structures for input & output + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + + // Input with negative values + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + + void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + auto* outputPtr = reinterpret_cast(alignedOutputPtr); + std::fill_n(outputPtr, numElements, -10.0f); + + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(runtime->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)}, + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(runtime->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)} + }; + + // Execute network + runtime->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + + // Tell the CLBackend to sync memory so we can read the output. 
+ arm_compute::CLScheduler::get().sync(); + auto* outputResult = reinterpret_cast(alignedOutputPtr); + std::cout << "Your number was " << outputResult[0] << std::endl; + runtime->UnloadNetwork(networkIdentifier); + return 0; + +} diff --git a/src/armnn/BackendRegistry.cpp b/src/armnn/BackendRegistry.cpp index ff63c8236a..80daed9896 100644 --- a/src/armnn/BackendRegistry.cpp +++ b/src/armnn/BackendRegistry.cpp @@ -39,6 +39,7 @@ void BackendRegistry::Register(const BackendId& id, BackendRegistry::FactoryFunc void BackendRegistry::Deregister(const BackendId& id) { m_Factories.erase(id); + DeregisterAllocator(id); if (m_ProfilingService.has_value() && m_ProfilingService.value().IsProfilingEnabled()) { @@ -106,5 +107,25 @@ void BackendRegistry::SetProfilingService(armnn::Optional alloc) +{ + if (m_CustomMemoryAllocatorMap.find(id) != m_CustomMemoryAllocatorMap.end()) + { + throw InvalidArgumentException( + std::string(id) + " already has an allocator associated with it", + CHECK_LOCATION()); + } + m_CustomMemoryAllocatorMap[id] = alloc; +} + +void BackendRegistry::DeregisterAllocator(const BackendId& id) +{ + m_CustomMemoryAllocatorMap.erase(id); +} + +std::unordered_map> BackendRegistry::GetAllocators() +{ + return m_CustomMemoryAllocatorMap; +} } // namespace armnn diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index c2b748653d..9fe58287c3 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -130,7 +130,8 @@ Status RuntimeImpl::LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr inNetwork, std::string& errorMessage) { - INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); + INetworkProperties networkProperties( + false, MemorySource::Undefined, MemorySource::Undefined); return LoadNetwork(networkIdOut, std::move(inNetwork), errorMessage, networkProperties); } @@ -267,7 +268,8 @@ RuntimeImpl::RuntimeImpl(const IRuntime::CreationOptions& options) if ( options.m_ProfilingOptions.m_TimelineEnabled && 
!options.m_ProfilingOptions.m_EnableProfiling ) { - throw RuntimeException("It is not possible to enable timeline reporting without profiling being enabled"); + throw RuntimeException( + "It is not possible to enable timeline reporting without profiling being enabled"); } // Load any available/compatible dynamic backend before the runtime @@ -283,6 +285,8 @@ RuntimeImpl::RuntimeImpl(const IRuntime::CreationOptions& options) auto backend = factoryFun(); ARMNN_ASSERT(backend.get() != nullptr); + auto customAllocatorMapIterator = options.m_CustomAllocatorMap.find(id); + // If the runtime is created in protected mode only add backends that support this mode if (options.m_ProtectedMode) { @@ -298,17 +302,61 @@ RuntimeImpl::RuntimeImpl(const IRuntime::CreationOptions& options) << " is not registered as does not support protected content allocation \n"; continue; } - std::string err; - if (!backend->UseCustomMemoryAllocator(err)) + // The user is responsible to provide a custom memory allocator which allows to allocate + // protected memory + if (customAllocatorMapIterator != options.m_CustomAllocatorMap.end()) { - ARMNN_LOG(error) << "The backend " + std::string err; + if (customAllocatorMapIterator->second->GetMemorySourceType() + == armnn::MemorySource::DmaBufProtected) + { + if (!backend->UseCustomMemoryAllocator(customAllocatorMapIterator->second, err)) + { + ARMNN_LOG(error) << "The backend " + << id + << " reported an error when entering protected mode. Backend won't be" + << " used. ErrorMsg: " << err; + continue; + } + // No errors so register the Custom Allocator with the BackendRegistry + BackendRegistryInstance().RegisterAllocator(id, customAllocatorMapIterator->second); + } + else + { + ARMNN_LOG(error) << "The CustomAllocator provided with the runtime options doesn't support " + "protected memory. Protected mode can't be activated. The backend " << id - << " reported an error when entering protected mode. Backend won't be used." 
- << " ErrorMsg: " << err; + << " is not going to be used. MemorySource must be MemorySource::DmaBufProtected"; + continue; + } + } + else + { + ARMNN_LOG(error) << "Protected mode can't be activated for backend: " + << id + << " no custom allocator was provided to the runtime options."; continue; } } - + else + { + // If a custom memory allocator is provided make the backend use that instead of the default + if (customAllocatorMapIterator != options.m_CustomAllocatorMap.end()) + { + std::string err; + if (!backend->UseCustomMemoryAllocator(customAllocatorMapIterator->second, err)) + { + ARMNN_LOG(error) << "The backend " + << id + << " reported an error when trying to use the provided custom allocator." + " Backend won't be used." + << " ErrorMsg: " << err; + continue; + } + // No errors so register the Custom Allocator with the BackendRegistry + BackendRegistryInstance().RegisterAllocator(id, customAllocatorMapIterator->second); + } + } auto context = backend->CreateBackendContext(options); // backends are allowed to return nullptrs if they diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index 19bd58193a..38aef671d2 100644 --- a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -220,9 +220,10 @@ public: return m_BackendCapabilities; }; - virtual bool UseCustomMemoryAllocator(armnn::Optional errMsg) override + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) override { - IgnoreUnused(errMsg); + IgnoreUnused(errMsg, allocator); m_CustomAllocator = true; return m_CustomAllocator; } @@ -925,131 +926,3 @@ TEST_CASE("OptimizeForExclusiveConnectionsWithoutFuseTest") &IsLayerOfType)); } } // Optimizer TestSuite - -TEST_SUITE("Runtime") -{ -// This test really belongs into RuntimeTests.cpp but it requires all sort of MockBackends which are -// already defined here -TEST_CASE("RuntimeProtectedModeOption") -{ - using namespace armnn; - - struct MockPolicy - { - static 
const BackendId& GetIdStatic() - { - static BackendId id = "MockBackend"; - return id; - } - }; - - struct ProtectedPolicy - { - static const BackendId& GetIdStatic() - { - static BackendId id = "MockBackendProtectedContent"; - return id; - } - }; - - struct SillyPolicy - { - static const BackendId& GetIdStatic() - { - static BackendId id = "SillyMockBackend"; - return id; - } - }; - - BackendCapabilities mockBackendCapabilities("MockBackend", - { - {"ProtectedContentAllocation", false} - }); - BackendCapabilities mockProtectedBackendCapabilities("MockBackendProtectedContent", - { - {"ProtectedContentAllocation", true} - }); - - auto& backendRegistry = BackendRegistryInstance(); - - // clean up from previous test runs - std::vector mockBackends = {"MockBackend", "MockBackendProtectedContent", "SillyMockBackend"}; - for (auto& backend : mockBackends) - { - backendRegistry.Deregister(backend); - } - - // Create a bunch of MockBackends with different capabilities - // 1. Doesn't support protected mode even though it knows about this capability - backendRegistry.Register("MockBackend", [mockBackendCapabilities]() - { - return std::make_unique>(mockBackendCapabilities); - }); - // 2. Supports protected mode and has it implemented correctly - backendRegistry.Register("MockBackendProtectedContent", [mockProtectedBackendCapabilities]() - { - return std::make_unique>(mockProtectedBackendCapabilities); - }); - // 3. 
Claims to support protected mode but doesn't have the UseCustomMemoryAllocator function implemented - backendRegistry.Register("SillyMockBackend", [mockProtectedBackendCapabilities]() - { - return std::make_unique>(mockProtectedBackendCapabilities); - }); - - // Creates a runtime that is not in protected mode - { - IRuntime::CreationOptions creationOptions; - creationOptions.m_ProtectedMode = false; - - IRuntimePtr run = IRuntime::Create(creationOptions); - - const armnn::BackendIdSet supportedDevices = run->GetDeviceSpec().GetSupportedBackends(); - // Both MockBackends that are registered should show up in the runtimes supported backends list - for (auto& backend : mockBackends) - { - CHECK(std::find(supportedDevices.cbegin(), supportedDevices.cend(), backend) != supportedDevices.cend()); - } - } - - // If the runtime is in protected mode only backends that support protected content should be added - { - IRuntime::CreationOptions creationOptions; - creationOptions.m_ProtectedMode = true; - - IRuntimePtr run = IRuntime::Create(creationOptions); - - const armnn::BackendIdSet supportedDevices = run->GetDeviceSpec().GetSupportedBackends(); - // Only the MockBackends that claims support for protected content should show up in the - // runtimes supported backends list - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackendProtectedContent") != supportedDevices.cend()); - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackend") == supportedDevices.cend()); - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "SillyMockBackend") == supportedDevices.cend()); - } - - // If the runtime is in protected mode only backends that support protected content should be added - { - IRuntime::CreationOptions creationOptions; - creationOptions.m_ProtectedMode = true; - - IRuntimePtr run = IRuntime::Create(creationOptions); - - const armnn::BackendIdSet supportedDevices = 
run->GetDeviceSpec().GetSupportedBackends(); - // Only the MockBackend that claims support for protected content should show up in the - // runtimes supported backends list - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackendProtectedContent") != supportedDevices.cend()); - - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackend") == supportedDevices.cend()); - } - -} -} diff --git a/src/backends/aclCommon/BaseMemoryManager.cpp b/src/backends/aclCommon/BaseMemoryManager.cpp index 45e0480a84..c60a4a04ae 100644 --- a/src/backends/aclCommon/BaseMemoryManager.cpp +++ b/src/backends/aclCommon/BaseMemoryManager.cpp @@ -15,7 +15,7 @@ namespace armnn { #if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) -BaseMemoryManager::BaseMemoryManager(std::unique_ptr alloc, +BaseMemoryManager::BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity) { ARMNN_ASSERT(alloc); diff --git a/src/backends/aclCommon/BaseMemoryManager.hpp b/src/backends/aclCommon/BaseMemoryManager.hpp index e80abf0edd..e3ffd188a1 100644 --- a/src/backends/aclCommon/BaseMemoryManager.hpp +++ b/src/backends/aclCommon/BaseMemoryManager.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #endif namespace armnn @@ -36,14 +37,14 @@ public: void Release() override; #if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) - BaseMemoryManager(std::unique_ptr alloc, MemoryAffinity memoryAffinity); + BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity); std::shared_ptr& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; } std::shared_ptr& GetInterLayerManager() { return m_InterLayerMemoryMgr; } std::shared_ptr& GetInterLayerMemoryGroup() { return m_InterLayerMemoryGroup; } protected: - std::unique_ptr m_Allocator; + std::shared_ptr m_Allocator; std::shared_ptr m_IntraLayerMemoryMgr; std::shared_ptr m_InterLayerMemoryMgr; std::shared_ptr m_InterLayerMemoryGroup; @@ -81,9 +82,10 @@ 
public: ClMemoryManager() {} virtual ~ClMemoryManager() {} - ClMemoryManager(std::unique_ptr alloc) + ClMemoryManager(std::shared_ptr alloc) : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer) { + arm_compute::CLTensorAllocator::set_global_allocator(alloc.get()); m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); } diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp index 12cb5e9956..34baad9d0c 100644 --- a/src/backends/backendsCommon/test/CompatibilityTests.cpp +++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp @@ -3,8 +3,12 @@ // SPDX-License-Identifier: MIT // +#if defined(ARMCOMPUTECL_ENABLED) #include +#endif +#if defined(ARMCOMPUTENEON_ENABLED) #include +#endif #include #include diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index f1e52c1998..b85232e75c 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -49,6 +49,10 @@ const BackendId& ClBackend::GetIdStatic() IBackendInternal::IMemoryManagerUniquePtr ClBackend::CreateMemoryManager() const { + if (m_UsingCustomAllocator) + { + return std::make_unique(m_CustomAllocator); + } return std::make_unique(std::make_unique()); } @@ -69,7 +73,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( TensorHandleFactoryRegistry& registry) const { - auto memoryManager = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); @@ -83,7 +95,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( IBackendInternal::IWorkloadFactoryPtr 
ClBackend::CreateWorkloadFactory( TensorHandleFactoryRegistry& registry, const ModelOptions& modelOptions) const { - auto memoryManager = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); @@ -100,7 +120,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) const { - auto memoryManager = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); @@ -118,10 +146,18 @@ std::vector ClBackend::GetHandleFactoryPreferen void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) { - auto mgr = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } - registry.RegisterMemoryManager(mgr); - registry.RegisterFactory(std::make_unique(mgr)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique(memoryManager)); registry.RegisterFactory(std::make_unique( static_cast(MemorySource::Malloc), static_cast(MemorySource::Malloc))); } @@ -130,10 +166,18 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) { - auto mgr = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if 
(m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } - registry.RegisterMemoryManager(mgr); - registry.RegisterFactory(std::make_unique(mgr)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique(memoryManager)); registry.RegisterFactory(std::make_unique(inputFlags, outputFlags)); } diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp index c742c0b204..c63bd25c56 100644 --- a/src/backends/cl/ClBackend.hpp +++ b/src/backends/cl/ClBackend.hpp @@ -6,6 +6,15 @@ #include +#include +#include + +#include +#include + +#include +#include + namespace armnn { @@ -20,7 +29,12 @@ const BackendCapabilities gpuAccCapabilities("GpuAcc", class ClBackend : public IBackendInternal { public: - ClBackend() : m_EnableCustomAllocator(false) {}; + ClBackend() : m_CustomAllocator(nullptr) {}; + ClBackend(std::shared_ptr allocator) + { + std::string err; + UseCustomMemoryAllocator(allocator, err); + } ~ClBackend() = default; static const BackendId& GetIdStatic(); @@ -72,17 +86,119 @@ public: return gpuAccCapabilities; }; - virtual bool UseCustomMemoryAllocator(armnn::Optional errMsg) override + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) override { IgnoreUnused(errMsg); + ARMNN_LOG(info) << "Using Custom Allocator for ClBackend"; // Set flag to signal the backend to use a custom memory allocator - m_EnableCustomAllocator = true; - - return m_EnableCustomAllocator; + m_CustomAllocator = std::make_shared(std::move(allocator)); + m_UsingCustomAllocator = true; + return m_UsingCustomAllocator; } - bool m_EnableCustomAllocator; + // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this + class ClBackendCustomAllocatorWrapper : public arm_compute::IAllocator + { + public: + ClBackendCustomAllocatorWrapper(std::shared_ptr alloc) : 
m_CustomAllocator(alloc) + {} + // Inherited methods overridden: + void* allocate(size_t size, size_t alignment) override + { + auto alloc = m_CustomAllocator->allocate(size, alignment); + return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType()); + } + void free(void* ptr) override + { + auto hostMemPtr = m_AllocatedBufferMappings[ptr]; + clReleaseMemObject(static_cast(ptr)); + m_CustomAllocator->free(hostMemPtr); + } + std::unique_ptr make_region(size_t size, size_t alignment) override + { + auto hostMemPtr = m_CustomAllocator->allocate(size, alignment); + cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType()); + + return std::make_unique(cl::Buffer(buffer), hostMemPtr); + } + private: + cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source) + { + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment); + + if (source == MemorySource::Malloc) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_HOST_ARM, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error)); + } + throw armnn::Exception( + "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator"); + } + std::shared_ptr m_CustomAllocator; + std::map m_AllocatedBufferMappings; + }; + + class ClBackendCustomAllocatorMemoryRegion : public 
arm_compute::ICLMemoryRegion + { + public: + // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access + ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr) + : ICLMemoryRegion(buffer.getInfo()) + { + _mem = buffer; + m_HostMemPtr = hostMemPtr; + } + + // Inherited methods overridden : + void* ptr() override + { + return nullptr; + } + + void* map(cl::CommandQueue &q, bool blocking) override + { + armnn::IgnoreUnused(q, blocking); + if (m_HostMemPtr == nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr"); + } + _mapping = m_HostMemPtr; + return _mapping; + } + + void unmap(cl::CommandQueue &q) override + { + armnn::IgnoreUnused(q); + _mapping = nullptr; + } + void* m_HostMemPtr = nullptr; + }; + + std::shared_ptr m_CustomAllocator; + bool m_UsingCustomAllocator = false; }; } // namespace armnn diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp index 3fca7cb127..69cd4a6d81 100644 --- a/src/backends/cl/ClImportTensorHandle.hpp +++ b/src/backends/cl/ClImportTensorHandle.hpp @@ -140,10 +140,16 @@ public: private: bool ClImport(const cl_import_properties_arm* importProperties, void* memory) { - const size_t totalBytes = m_Tensor.info()->total_size(); + size_t totalBytes = m_Tensor.info()->total_size(); + + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + cl_int error = CL_SUCCESS; cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), - CL_MEM_READ_WRITE, importProperties, memory, totalBytes, &error); + CL_MEM_READ_WRITE, importProperties, memory, roundedSize, &error); if (error != CL_SUCCESS) { throw 
MemoryImportException("ClImportTensorHandle::Invalid imported memory" + std::to_string(error)); diff --git a/src/backends/cl/ClRegistryInitializer.cpp b/src/backends/cl/ClRegistryInitializer.cpp index 8decd6f689..aadc14bd68 100644 --- a/src/backends/cl/ClRegistryInitializer.cpp +++ b/src/backends/cl/ClRegistryInitializer.cpp @@ -18,6 +18,14 @@ static BackendRegistry::StaticRegistryInitializer g_RegisterHelper ClBackend::GetIdStatic(), []() { + // Check if we have a CustomMemoryAllocator associated with the backend + // and if so register it with the backend. + auto customAllocators = BackendRegistryInstance().GetAllocators(); + auto allocatorIterator = customAllocators.find(ClBackend::GetIdStatic()); + if (allocatorIterator != customAllocators.end()) + { + return IBackendInternalUniquePtr(new ClBackend(allocatorIterator->second)); + } return IBackendInternalUniquePtr(new ClBackend); } }; diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt index 6662a1e659..41cbe24c15 100644 --- a/src/backends/cl/test/CMakeLists.txt +++ b/src/backends/cl/test/CMakeLists.txt @@ -6,6 +6,7 @@ list(APPEND armnnClBackendUnitTests_sources ClContextControlFixture.hpp ClContextSerializerTests.cpp + ClCustomAllocatorTests.cpp ClCreateWorkloadTests.cpp ClEndToEndTests.cpp ClImportTensorHandleFactoryTests.cpp diff --git a/src/backends/cl/test/ClCustomAllocatorTests.cpp b/src/backends/cl/test/ClCustomAllocatorTests.cpp new file mode 100644 index 0000000000..4d1a0e1cfb --- /dev/null +++ b/src/backends/cl/test/ClCustomAllocatorTests.cpp @@ -0,0 +1,160 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// Contains the OpenCl interfaces for mapping memory in the Gpu Page Tables +// Requires the OpenCl backend to be included (GpuAcc) +#include +#include +#include + + +/** Sample implementation of ICustomAllocator for use with the ClBackend. + * Note: any memory allocated must be host accessible with write access to allow for weights and biases + * to be passed in. Read access is not required. */ +class SampleClBackendCustomAllocator : public armnn::ICustomAllocator +{ +public: + SampleClBackendCustomAllocator() = default; + + void* allocate(size_t size, size_t alignment) + { + // If alignment is 0 just use the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment + if (alignment == 0) + { + alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo(); + } + size_t space = size + alignment + alignment; + auto allocatedMemPtr = std::malloc(space * sizeof(size_t)); + + if (std::align(alignment, size, allocatedMemPtr, space) == nullptr) + { + throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed"); + } + return allocatedMemPtr; + } + + /** Interface to be implemented by the child class to free the allocated tensor */ + void free(void* ptr) + { + std::free(ptr); + } + + armnn::MemorySource GetMemorySourceType() + { + return armnn::MemorySource::Malloc; + } +}; + +TEST_SUITE("ClCustomAllocatorTests") +{ + +// This is a copy of the SimpleSample app modified to use a custom +// allocator for the clbackend. 
It creates a FullyConnected network with a single layer +// taking a single number as an input +TEST_CASE("ClCustomAllocatorTest") +{ + using namespace armnn; + + float number = 3; + + // Construct ArmNN network + armnn::NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + + armnn::FullyConnectedDescriptor fullyConnectedDesc; + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + weightsInfo.SetConstant(true); + armnn::ConstTensor weights(weightsInfo, weightsData); + + ARMNN_NO_DEPRECATE_WARN_BEGIN + IConnectableLayer* fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, + weights, + EmptyOptional(), + "fully connected"); + ARMNN_NO_DEPRECATE_WARN_END + IConnectableLayer* InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0); + InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0)); + fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + //Set the tensors in the network. 
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // Optimise ArmNN network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = true; + armnn::IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {"GpuAcc"}, run->GetDeviceSpec(), optOptions); + CHECK(optNet); + + // Load graph into runtime + std::string ignoredErrorMessage; + INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); + run->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Create structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + + // Fill the input buffer with the test value + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + + void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + auto* outputPtr = reinterpret_cast(alignedOutputPtr); + std::fill_n(outputPtr, numElements, -10.0f); + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)}, + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)} + }; + + // Execute network + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + run->UnloadNetwork(networkIdentifier); + + + // Tell the CLBackend to sync memory so we can read the output. 
+ arm_compute::CLScheduler::get().sync(); + auto* outputResult = reinterpret_cast(alignedOutputPtr); + + run->UnloadNetwork(networkIdentifier); + CHECK(outputResult[0] == number); + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(ClBackend::GetIdStatic()); +} + +} // test suite ClCustomAllocatorTests \ No newline at end of file diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp index 931729a736..6b1d3521d5 100644 --- a/src/backends/cl/test/ClImportTensorHandleTests.cpp +++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp @@ -61,7 +61,7 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport") // Validate result by checking that the output has no negative values for(unsigned int i = 0; i < numElements; ++i) { - CHECK(typedPtr[i] >= 0); + CHECK(typedPtr[i] == 0); } } -- cgit v1.2.1