diff options
Diffstat (limited to 'src/backends/cl')
-rw-r--r-- | src/backends/cl/ClBackend.cpp | 62 | ||||
-rw-r--r-- | src/backends/cl/ClBackend.hpp | 128 | ||||
-rw-r--r-- | src/backends/cl/ClImportTensorHandle.hpp | 10 | ||||
-rw-r--r-- | src/backends/cl/ClRegistryInitializer.cpp | 8 | ||||
-rw-r--r-- | src/backends/cl/test/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/backends/cl/test/ClCustomAllocatorTests.cpp | 160 | ||||
-rw-r--r-- | src/backends/cl/test/ClImportTensorHandleTests.cpp | 2 |
7 files changed, 353 insertions, 18 deletions
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index f1e52c1998..b85232e75c 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -49,6 +49,10 @@ const BackendId& ClBackend::GetIdStatic() IBackendInternal::IMemoryManagerUniquePtr ClBackend::CreateMemoryManager() const { + if (m_UsingCustomAllocator) + { + return std::make_unique<ClMemoryManager>(m_CustomAllocator); + } return std::make_unique<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); } @@ -69,7 +73,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( TensorHandleFactoryRegistry& registry) const { - auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + std::shared_ptr<ClMemoryManager> memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager)); @@ -83,7 +95,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( TensorHandleFactoryRegistry& registry, const ModelOptions& modelOptions) const { - auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + std::shared_ptr<ClMemoryManager> memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + } registry.RegisterMemoryManager(memoryManager); 
registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager)); @@ -100,7 +120,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) const { - auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + std::shared_ptr<ClMemoryManager> memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager)); @@ -118,10 +146,18 @@ std::vector<ITensorHandleFactory::FactoryId> ClBackend::GetHandleFactoryPreferen void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) { - auto mgr = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + std::shared_ptr<ClMemoryManager> memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + } - registry.RegisterMemoryManager(mgr); - registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(mgr)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager)); registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>( static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc))); } @@ -130,10 +166,18 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) { - auto mgr = 
std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + std::shared_ptr<ClMemoryManager> memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>()); + } - registry.RegisterMemoryManager(mgr); - registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(mgr)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager)); registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(inputFlags, outputFlags)); } diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp index c742c0b204..c63bd25c56 100644 --- a/src/backends/cl/ClBackend.hpp +++ b/src/backends/cl/ClBackend.hpp @@ -6,6 +6,15 @@ #include <armnn/backends/IBackendInternal.hpp> +#include <arm_compute/core/Types.h> +#include <arm_compute/runtime/CL/CLBufferAllocator.h> + +#include <aclCommon/BaseMemoryManager.hpp> +#include <arm_compute/runtime/CL/CLMemoryRegion.h> + +#include <arm_compute/core/CL/CLKernelLibrary.h> +#include <CL/cl_ext.h> + namespace armnn { @@ -20,7 +29,12 @@ const BackendCapabilities gpuAccCapabilities("GpuAcc", class ClBackend : public IBackendInternal { public: - ClBackend() : m_EnableCustomAllocator(false) {}; + ClBackend() : m_CustomAllocator(nullptr) {}; + ClBackend(std::shared_ptr<ICustomAllocator> allocator) + { + std::string err; + UseCustomMemoryAllocator(allocator, err); + } ~ClBackend() = default; static const BackendId& GetIdStatic(); @@ -72,17 +86,119 @@ public: return gpuAccCapabilities; }; - virtual bool UseCustomMemoryAllocator(armnn::Optional<std::string&> errMsg) override + virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator, + armnn::Optional<std::string&> errMsg) override { IgnoreUnused(errMsg); + ARMNN_LOG(info) << "Using Custom 
Allocator for ClBackend"; // Set flag to signal the backend to use a custom memory allocator - m_EnableCustomAllocator = true; - - return m_EnableCustomAllocator; + m_CustomAllocator = std::make_shared<ClBackendCustomAllocatorWrapper>(std::move(allocator)); + m_UsingCustomAllocator = true; + return m_UsingCustomAllocator; } - bool m_EnableCustomAllocator; + // Cl requires an arm_compute::IAllocator, so we wrap the Arm NN ICustomAllocator to achieve this + class ClBackendCustomAllocatorWrapper : public arm_compute::IAllocator + { + public: + ClBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc) + {} + // Inherited methods overridden: + void* allocate(size_t size, size_t alignment) override + { + auto alloc = m_CustomAllocator->allocate(size, alignment); + return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType()); + } + void free(void* ptr) override + { + auto hostMemPtr = m_AllocatedBufferMappings[ptr]; + clReleaseMemObject(static_cast<cl_mem>(ptr)); + m_CustomAllocator->free(hostMemPtr); + } + std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override + { + auto hostMemPtr = m_CustomAllocator->allocate(size, alignment); + cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType()); + + return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer), hostMemPtr); + } + private: + cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source) + { + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); + auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment); + + if (source == MemorySource::Malloc) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_HOST_ARM, + 0 + }; + cl_int 
error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error)); + } + throw armnn::Exception( + "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator"); + } + std::shared_ptr<ICustomAllocator> m_CustomAllocator; + std::map<void*, void*> m_AllocatedBufferMappings; + }; + + class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion + { + public: + // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access + ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr) + : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>()) + { + _mem = buffer; + m_HostMemPtr = hostMemPtr; + } + + // Inherited methods overridden : + void* ptr() override + { + return nullptr; + } + + void* map(cl::CommandQueue &q, bool blocking) override + { + armnn::IgnoreUnused(q, blocking); + if (m_HostMemPtr == nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr"); + } + _mapping = m_HostMemPtr; + return _mapping; + } + + void unmap(cl::CommandQueue &q) override + { + armnn::IgnoreUnused(q); + _mapping = nullptr; + } + void* m_HostMemPtr = nullptr; + }; + + std::shared_ptr<ClBackendCustomAllocatorWrapper> m_CustomAllocator; + bool m_UsingCustomAllocator = false; }; } // namespace armnn diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp index 3fca7cb127..69cd4a6d81 100644 --- a/src/backends/cl/ClImportTensorHandle.hpp +++ b/src/backends/cl/ClImportTensorHandle.hpp @@ -140,10 +140,16 @@ public: private: bool 
ClImport(const cl_import_properties_arm* importProperties, void* memory) { - const size_t totalBytes = m_Tensor.info()->total_size(); + size_t totalBytes = m_Tensor.info()->total_size(); + + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); + auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + cl_int error = CL_SUCCESS; cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), - CL_MEM_READ_WRITE, importProperties, memory, totalBytes, &error); + CL_MEM_READ_WRITE, importProperties, memory, roundedSize, &error); if (error != CL_SUCCESS) { throw MemoryImportException("ClImportTensorHandle::Invalid imported memory" + std::to_string(error)); diff --git a/src/backends/cl/ClRegistryInitializer.cpp b/src/backends/cl/ClRegistryInitializer.cpp index 8decd6f689..aadc14bd68 100644 --- a/src/backends/cl/ClRegistryInitializer.cpp +++ b/src/backends/cl/ClRegistryInitializer.cpp @@ -18,6 +18,14 @@ static BackendRegistry::StaticRegistryInitializer g_RegisterHelper ClBackend::GetIdStatic(), []() { + // Check if we have a CustomMemoryAllocator associated with the backend + // and if so register it with the backend. 
+ auto customAllocators = BackendRegistryInstance().GetAllocators(); + auto allocatorIterator = customAllocators.find(ClBackend::GetIdStatic()); + if (allocatorIterator != customAllocators.end()) + { + return IBackendInternalUniquePtr(new ClBackend(allocatorIterator->second)); + } return IBackendInternalUniquePtr(new ClBackend); } }; diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt index 6662a1e659..41cbe24c15 100644 --- a/src/backends/cl/test/CMakeLists.txt +++ b/src/backends/cl/test/CMakeLists.txt @@ -6,6 +6,7 @@ list(APPEND armnnClBackendUnitTests_sources ClContextControlFixture.hpp ClContextSerializerTests.cpp + ClCustomAllocatorTests.cpp ClCreateWorkloadTests.cpp ClEndToEndTests.cpp ClImportTensorHandleFactoryTests.cpp diff --git a/src/backends/cl/test/ClCustomAllocatorTests.cpp b/src/backends/cl/test/ClCustomAllocatorTests.cpp new file mode 100644 index 0000000000..4d1a0e1cfb --- /dev/null +++ b/src/backends/cl/test/ClCustomAllocatorTests.cpp @@ -0,0 +1,160 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include <armnn/backends/ICustomAllocator.hpp> +#include <armnn/Descriptors.hpp> +#include <armnn/Exceptions.hpp> +#include <armnn/INetwork.hpp> +#include <armnn/IRuntime.hpp> +#include <armnn/Utils.hpp> +#include <armnn/BackendRegistry.hpp> +#include <cl/ClBackend.hpp> + +#include <doctest/doctest.h> + +// Contains the OpenCl interfaces for mapping memory in the Gpu Page Tables +// Requires the OpenCl backend to be included (GpuAcc) +#include <arm_compute/core/CL/CLKernelLibrary.h> +#include <CL/cl_ext.h> +#include <arm_compute/runtime/CL/CLScheduler.h> + + +/** Sample implementation of ICustomAllocator for use with the ClBackend. + * Note: any memory allocated must be host accessible with write access to allow for weights and biases + * to be passed in. Read access is not required.. 
*/ +class SampleClBackendCustomAllocator : public armnn::ICustomAllocator +{ +public: + SampleClBackendCustomAllocator() = default; + + void* allocate(size_t size, size_t alignment) + { + // If alignment is 0 just use the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment + if (alignment == 0) + { + alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); + } + size_t space = size + alignment + alignment; + auto allocatedMemPtr = std::malloc(space * sizeof(size_t)); + + if (std::align(alignment, size, allocatedMemPtr, space) == nullptr) + { + throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed"); + } + return allocatedMemPtr; + } + + /** Interface to be implemented by the child class to free the allocated tensor */ + void free(void* ptr) + { + std::free(ptr); + } + + armnn::MemorySource GetMemorySourceType() + { + return armnn::MemorySource::Malloc; + } +}; + +TEST_SUITE("ClCustomAllocatorTests") +{ + +// This is a copy of the SimpleSample app modified to use a custom +// allocator for the clbackend. 
It creates a FullyConnected network with a single layer +// taking a single number as an input +TEST_CASE("ClCustomAllocatorTest") +{ + using namespace armnn; + + float number = 3; + + // Construct ArmNN network + armnn::NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + + armnn::FullyConnectedDescriptor fullyConnectedDesc; + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + weightsInfo.SetConstant(true); + armnn::ConstTensor weights(weightsInfo, weightsData); + + ARMNN_NO_DEPRECATE_WARN_BEGIN + IConnectableLayer* fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, + weights, + EmptyOptional(), + "fully connected"); + ARMNN_NO_DEPRECATE_WARN_END + IConnectableLayer* InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0); + InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0)); + fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>(); + options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + //Set the tensors in the network. 
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // Optimise ArmNN network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = true; + armnn::IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {"GpuAcc"}, run->GetDeviceSpec(), optOptions); + CHECK(optNet); + + // Load graph into runtime + std::string ignoredErrorMessage; + INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); + run->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + + // Fill the input buffer with the test value + auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + + void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr); + std::fill_n(outputPtr, numElements, -10.0f); + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)}, + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)} + }; + + // Execute network + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + run->UnloadNetwork(networkIdentifier); + + + // Tell the CLBackend to sync memory so we can read the output. 
+ arm_compute::CLScheduler::get().sync(); + auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr); + + run->UnloadNetwork(networkIdentifier); + CHECK(outputResult[0] == number); + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(ClBackend::GetIdStatic()); +} + +} // test suite ClCustomAllocatorTests
\ No newline at end of file diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp index 931729a736..6b1d3521d5 100644 --- a/src/backends/cl/test/ClImportTensorHandleTests.cpp +++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp @@ -61,7 +61,7 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport") // Validate result by checking that the output has no negative values for(unsigned int i = 0; i < numElements; ++i) { - CHECK(typedPtr[i] >= 0); + CHECK(typedPtr[i] == 0); } } |