1 files changed, 251 insertions, 20 deletions
diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp
index 803c6a4c66..6d886a12b1 100644
--- a/src/backends/gpuFsa/GpuFsaBackend.hpp
+++ b/src/backends/gpuFsa/GpuFsaBackend.hpp
@@ -1,56 +1,287 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
 
 #include <armnn/backends/IBackendInternal.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+#include <arm_compute/runtime/CL/CLMemoryRegion.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <CL/cl_ext.h>
+
+// System includes for mapping and unmapping memory
+#include <sys/mman.h>
 
 namespace armnn
 {
 
+// Capabilities reported by GpuFsaBackend::GetCapabilities(); add new capabilities here.
+const BackendCapabilities gpuFsaCapabilities("GpuFsa",
+                                             {
+                                                     {"NonConstWeights", false},
+                                                     {"AsyncExecution", false},
+                                                     {"ProtectedContentAllocation", true},
+                                                     {"ConstantTensorsAsInputs", true},
+                                                     {"PreImportIOTensors", false},
+                                                     {"ExternallyManagedMemory", true},
+                                                     {"MultiAxisPacking", false},
+                                                     {"SingleAxisPacking", true}
+                                             });
+
 class GpuFsaBackend : public IBackendInternal
 {
 public:
-    GpuFsaBackend() = default;
+    GpuFsaBackend() : m_CustomAllocator(nullptr) {}
+    GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator)
+    {
+        std::string err; // Sink for the error message; its content is not inspected afterwards.
+        UseCustomMemoryAllocator(std::move(allocator), err); // Moved: the parameter is not used again here.
+    }
     ~GpuFsaBackend() = default;
 
     static const BackendId& GetIdStatic();
-    const BackendId& GetId() const override
-    {
-        return GetIdStatic();
-    }
+    const BackendId& GetId() const override { return GetIdStatic(); } // Returns the same id as GetIdStatic().
 
     IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;
 
     IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
-            const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;
+        const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;
 
-    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
-            class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const override;
+    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;
 
-    IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
+    IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
+                                              const ModelOptions& modelOptions,
+                                              MemorySourceFlags inputFlags,
+                                              MemorySourceFlags outputFlags) const override;
+
+    std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
+
+    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;
 
-    IBackendInternal::IBackendProfilingContextPtr
-    CreateBackendProfilingContext(const IRuntime::CreationOptions& creationOptions,
-                                  IBackendProfilingPtr& backendProfiling) override;
+    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+                                       MemorySourceFlags inputFlags,
+                                       MemorySourceFlags outputFlags) override;
+
+    IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
+    IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
+        const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;
 
     IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;
 
     OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
                                            const ModelOptions& modelOptions) const override;
 
-    std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
+    std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;
 
-    void RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry) override;
+    BackendCapabilities GetCapabilities() const override
+    {
+        return gpuFsaCapabilities; // Returned by value, i.e. a copy of the gpuFsaCapabilities table.
+    }
 
-    std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;
+    // Installs 'allocator' as this backend's memory source by wrapping it in the arm_compute::IAllocator
+    // adapter. errMsg is accepted but never written; the call always reports success.
+    virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
+                                          armnn::Optional<std::string&> errMsg) override
+    {
+        IgnoreUnused(errMsg);
+        ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";
+        m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));
+        m_UsingCustomAllocator = true; // Signals the backend to use the custom memory allocator
+        return true;
+    }
+
+    // Adapter that exposes an Arm NN ICustomAllocator through the arm_compute::IAllocator interface that CL
+    // requires. Host memory from the wrapped allocator is imported into OpenCL with clImportMemoryARM and
+    // callers are handed the resulting cl_mem handle.
+    class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator
+    {
+    public:
+        // alloc: allocator that provides, and later releases, the underlying host memory.
+        GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)
+        {}
+        // Inherited methods overridden:
+        // Allocates host memory through the wrapped allocator and imports it into OpenCL.
+        // Returns the imported cl_mem handle as void*; pass it to free() to release it.
+        // Throws armnn::Exception if the import fails or the allocator's memory source is unsupported.
+        void* allocate(size_t size, size_t alignment) override
+        {
+            auto alloc = m_CustomAllocator->allocate(size, alignment);
+            return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
+        }
+
+        // Releases a cl_mem handle returned by allocate() and hands its host memory back to the allocator.
+        void free(void* ptr) override
+        {
+            void* hostMemPtr = nullptr;
+            auto mapping = m_AllocatedBufferMappings.find(ptr);
+            if (mapping != m_AllocatedBufferMappings.end())
+            {
+                hostMemPtr = mapping->second;
+                // Forget the association: a later import may yield the same cl_mem value, and a stale entry
+                // would pair that new buffer with this (already freed) host pointer.
+                m_AllocatedBufferMappings.erase(mapping);
+            }
+            clReleaseMemObject(static_cast<cl_mem>(ptr));
+            m_CustomAllocator->free(hostMemPtr);
+        }
+
+        // Allocates host memory, imports it into OpenCL and returns a memory region holding both the cl_mem
+        // buffer and the host pointer. Throws armnn::Exception under the same conditions as allocate().
+        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
+        {
+            auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
+            cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());
+            return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),
+                                                                          hostMemPtr,
+                                                                          m_CustomAllocator->GetMemorySourceType());
+        }
+
+    private:
+        // Imports 'memory' into the CL context as a read/write buffer using the import properties that match
+        // 'source', and records the cl_mem -> host pointer association that free() relies on.
+        // Throws armnn::Exception if the import fails or 'source' is not Malloc, DmaBuf or DmaBufProtected.
+        cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
+        {
+            // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
+            auto cachelineAlignment =
+                    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+            // A reported cache line size of 0 would make the modulo below undefined, so skip rounding then.
+            // NOTE(review): a size that is already a multiple still grows by one full line - confirm intended.
+            const size_t roundedSize =
+                    (cachelineAlignment == 0) ? size : cachelineAlignment + size - (size % cachelineAlignment);
+            if (source == MemorySource::Malloc)
+            {
+                const cl_import_properties_arm importProperties[] =
+                        {
+                            CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM,
+                            0
+                        };
+                return ImportMemory(importProperties, memory, roundedSize);
+            }
+            else if (source == MemorySource::DmaBuf)
+            {
+                const cl_import_properties_arm importProperties[] =
+                        {
+                            CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_DMA_BUF_ARM,
+                            CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM, CL_TRUE,
+                            0
+                        };
+                return ImportMemory(importProperties, memory, roundedSize);
+            }
+            else if (source == MemorySource::DmaBufProtected)
+            {
+                const cl_import_properties_arm importProperties[] =
+                        {
+                            CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_DMA_BUF_ARM,
+                            CL_IMPORT_TYPE_PROTECTED_ARM, CL_TRUE,
+                            0
+                        };
+                return ImportMemory(importProperties, memory, roundedSize);
+            }
+            throw armnn::Exception(
+                    "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
+        }
+
+        // Performs the clImportMemoryARM call shared by all memory sources. 'importProperties' is the
+        // property list ending in 0 built by MapAllocatedMemory. Throws armnn::Exception on a CL error.
+        cl_mem ImportMemory(const cl_import_properties_arm* importProperties, void* memory, size_t importSize)
+        {
+            cl_int error = CL_SUCCESS;
+            cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+                                              CL_MEM_READ_WRITE,
+                                              importProperties,
+                                              memory,
+                                              importSize,
+                                              &error);
+            if (error != CL_SUCCESS)
+            {
+                throw armnn::Exception(
+                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
+            }
+            // Assign rather than insert(): insert() keeps an existing entry, so a recycled cl_mem value
+            // would otherwise stay associated with an older host pointer.
+            m_AllocatedBufferMappings[static_cast<void*>(buffer)] = memory;
+            return buffer;
+        }
+        std::shared_ptr<ICustomAllocator> m_CustomAllocator;
+        std::map<void*, void*> m_AllocatedBufferMappings;
+    };
+
+    class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
+    {
+    public:
+        // ICLMemoryRegion variant that also keeps the host pointer of the buffer, to allow for cpu copy access
+        ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)
+            : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+            , m_HostMemPtr(hostMemPtr)
+            , m_MemorySource(source)
+        {
+            _mem = buffer; // Not a member declared in this class, so it is assigned in the body
+        }
+
+        // Inherited methods overridden :
+        void* ptr() override { return nullptr; } // Always nullptr; host access goes through map()
+
+        // Returns a host pointer to the region; throws armnn::Exception when the region cannot be mapped.
+        void* map(cl::CommandQueue &q, bool blocking) override
+        {
+            armnn::IgnoreUnused(q, blocking);
+            if (m_HostMemPtr == nullptr)
+            {
+                throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
+            }
+            if (_mapping != nullptr)
+            {
+                throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");
+            }
+            switch (m_MemorySource)
+            {
+                case armnn::MemorySource::Malloc:
+                    _mapping = m_HostMemPtr;
+                    return _mapping;
+                case armnn::MemorySource::DmaBuf:
+                case armnn::MemorySource::DmaBufProtected:
+                    // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd
+                    _mapping = mmap(nullptr, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast<int*>(m_HostMemPtr)), 0);
+                    if (_mapping == MAP_FAILED)
+                    {
+                        _mapping = nullptr; // Stay "unmapped" so MAP_FAILED is never handed out as a pointer
+                        throw armnn::Exception("ClBackend: Failed to mmap imported dma-buf memory");
+                    }
+                    return _mapping;
+                default:
+                    throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");
+            }
+        }
 
-private:
-    // Private members
+        // Ends the host mapping created by map(); dma-buf backed regions are additionally munmap'ed.
+        void unmap(cl::CommandQueue &q) override
+        {
+            armnn::IgnoreUnused(q);
+            const bool isMalloc = (m_MemorySource == armnn::MemorySource::Malloc);
+            const bool isDmaBuf = (m_MemorySource == armnn::MemorySource::DmaBuf) ||
+                                  (m_MemorySource == armnn::MemorySource::DmaBufProtected);
+            if (!isMalloc && !isDmaBuf)
+            {
+                throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");
+            }
+            if (isDmaBuf)
+            {
+                // Only the dma-buf path of map() goes through mmap(), so only it needs munmap()
+                munmap(_mapping, _size);
+            }
+            _mapping = nullptr;
+        }
+    private:
+        void* m_HostMemPtr = nullptr;
+        armnn::MemorySource m_MemorySource;
+    };
 
-protected:
-    // Protected members
+    std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator;
+    bool m_UsingCustomAllocator = false;
 };
 
 } // namespace armnn