From c1c872f12797ef6fe52c4589113e7efc353e56eb Mon Sep 17 00:00:00 2001 From: Jan Eilers Date: Thu, 22 Jul 2021 13:17:04 +0100 Subject: Adds CustomAllocator interface and Sample App * Updates the runtime options with a CustomAllocatorMap which allows to define a CustomAllocator for specific backends * Change IBackendInternal interface to use a shared pointer to a custom allocator * Update ClBackend.hpp/cpp to use the CustomAllocator * Adds an example application and unit test which uses a CustomAllocator for GpuAcc * Refactor of the interface to use MemorySource instead of the user Mapping cl_mem directly * Modify the BackendRegistry to also hold a registry of CustomAllocators * BackendRegistry Deregister will also deregister any allocators associated with that backend id * set_global_allocator within the BaseMemoryManager so that it always matches the currently used allocator Signed-off-by: Jan Eilers Change-Id: I156d819686021865f4375e6cb7a5c5dec8fee9e8 Signed-off-by: David Monahan --- include/armnn/BackendRegistry.hpp | 5 + include/armnn/IRuntime.hpp | 20 ++- include/armnn/backends/IBackendInternal.hpp | 5 +- include/armnn/backends/ICustomAllocator.hpp | 18 ++- samples/CMakeLists.txt | 5 + samples/CustomMemoryAllocatorSample.cpp | 175 +++++++++++++++++++++ src/armnn/BackendRegistry.cpp | 21 +++ src/armnn/Runtime.cpp | 64 +++++++- src/armnn/test/OptimizerTests.cpp | 133 +--------------- src/backends/aclCommon/BaseMemoryManager.cpp | 2 +- src/backends/aclCommon/BaseMemoryManager.hpp | 8 +- .../backendsCommon/test/CompatibilityTests.cpp | 4 + src/backends/cl/ClBackend.cpp | 62 ++++++-- src/backends/cl/ClBackend.hpp | 128 ++++++++++++++- src/backends/cl/ClImportTensorHandle.hpp | 10 +- src/backends/cl/ClRegistryInitializer.cpp | 8 + src/backends/cl/test/CMakeLists.txt | 1 + src/backends/cl/test/ClCustomAllocatorTests.cpp | 160 +++++++++++++++++++ src/backends/cl/test/ClImportTensorHandleTests.cpp | 2 +- 19 files changed, 658 insertions(+), 173 deletions(-) create mode 
100644 samples/CustomMemoryAllocatorSample.cpp create mode 100644 src/backends/cl/test/ClCustomAllocatorTests.cpp diff --git a/include/armnn/BackendRegistry.hpp b/include/armnn/BackendRegistry.hpp index fe6451cde0..c13aa9f8b6 100644 --- a/include/armnn/BackendRegistry.hpp +++ b/include/armnn/BackendRegistry.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,8 @@ public: BackendIdSet GetBackendIds() const; std::string GetBackendIdsAsString() const; void SetProfilingService(armnn::Optional profilingService); + void RegisterAllocator(const BackendId& id, std::shared_ptr alloc); + std::unordered_map> GetAllocators(); BackendRegistry() {} virtual ~BackendRegistry() {} @@ -50,6 +53,7 @@ public: }; void Deregister(const BackendId& id); + void DeregisterAllocator(const BackendId &id); protected: using FactoryStorage = std::unordered_map; @@ -63,6 +67,7 @@ private: FactoryStorage m_Factories; armnn::Optional m_ProfilingService; + std::unordered_map> m_CustomMemoryAllocatorMap; }; BackendRegistry& BackendRegistryInstance(); diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index 8c269dee49..97a9c2889e 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -16,6 +16,7 @@ #include #include +#include namespace armnn { @@ -103,8 +104,8 @@ public: : m_GpuAccTunedParameters(nullptr) , m_EnableGpuProfiling(false) , m_DynamicBackendsPath("") - , m_CustomAllocator(nullptr) , m_ProtectedMode(false) + , m_CustomAllocatorMap() {} /// If set, uses the GpuAcc tuned parameters from the given object when executing GPU workloads. @@ -118,17 +119,22 @@ public: /// Only a single path is allowed for the override std::string m_DynamicBackendsPath; - /// A Custom Allocator used for allocation of working memory in the backends. 
- /// Set this for when you need to allocate Protected Working Memory, required for ProtectedMode - /// Only supported for GpuAcc - ICustomAllocator* m_CustomAllocator; - /// Setting this flag will allow the user to create the Runtime in protected mode. /// It will run all the inferences on protected memory and will make sure that /// INetworkProperties::m_ImportEnabled set to true with MemorySource::DmaBufProtected option - /// This will use Protected Memory Allocator associated with the backend + /// This requires that the backend supports Protected Memory and has an allocator capable of + /// allocating Protected Memory associated with it. bool m_ProtectedMode; + /// @brief A map to define a custom memory allocator for specific backend Ids. + /// + /// @details A Custom Allocator is used for allocation of working memory in the backends. + /// Set this if you need to take control of how memory is allocated on a backend. Required for + /// Protected Mode in order to correctly allocate Protected Memory + /// + /// @note Only supported for GpuAcc + std::map> m_CustomAllocatorMap; + struct ExternalProfilingOptions { ExternalProfilingOptions() diff --git a/include/armnn/backends/IBackendInternal.hpp b/include/armnn/backends/IBackendInternal.hpp index 3b4ef95703..626746465f 100644 --- a/include/armnn/backends/IBackendInternal.hpp +++ b/include/armnn/backends/IBackendInternal.hpp @@ -199,10 +199,13 @@ public: /// Signals the backend to use a custom memory allocator provided by the user /// + /// \param allocator - a pointer to the provided ICustomAllocator to use with this backend /// \param errMsg - Optional string variable to return error messages /// \return - Returns true if switching to custom allocator was successful - virtual bool UseCustomMemoryAllocator(armnn::Optional errMsg) + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) { + IgnoreUnused(allocator); if (errMsg) { std::stringstream message; diff --git 
a/include/armnn/backends/ICustomAllocator.hpp b/include/armnn/backends/ICustomAllocator.hpp index 1d4df0cb86..92cbcc2641 100644 --- a/include/armnn/backends/ICustomAllocator.hpp +++ b/include/armnn/backends/ICustomAllocator.hpp @@ -7,6 +7,7 @@ #include #include +#include namespace armnn { @@ -23,13 +24,20 @@ public: * @param[in] alignment Alignment that the returned pointer should comply with * * @return A pointer to the allocated memory + * The returned pointer must be host write accessible */ - virtual void *allocate(size_t size, size_t alignment) = 0; - /** Interface to be implemented by the child class to free the allocated tensor */ - virtual void free(void *ptr) = 0; + virtual void* allocate(size_t size, size_t alignment) = 0; - // Utility Function to define the Custom Memory Allocators capabilities - virtual bool SupportsProtectedMemory() = 0; + /** Interface to be implemented by the child class to free the allocated bytes */ + virtual void free(void* ptr) = 0; + + // Used to specify what type of memory is being allocated by this allocator. 
+ // Supported types are: + // MemorySource::Malloc + // Unsupported types are: + // MemorySource::DmaBuf + // MemorySource::DmaBufProtected + virtual armnn::MemorySource GetMemorySourceType() = 0; }; } // namespace armnn \ No newline at end of file diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index ff45eecbe0..7be6a69369 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -8,3 +8,8 @@ if(BUILD_SAMPLE_APP AND SAMPLE_DYNAMIC_BACKEND) target_link_libraries(DynamicSample armnn ${CMAKE_THREAD_LIBS_INIT}) endif() +if(BUILD_SAMPLE_APP AND ARMCOMPUTECL) + add_executable(CustomMemoryAllocatorSample CustomMemoryAllocatorSample.cpp) + target_link_libraries(CustomMemoryAllocatorSample armnn ${CMAKE_THREAD_LIBS_INIT}) +endif() + diff --git a/samples/CustomMemoryAllocatorSample.cpp b/samples/CustomMemoryAllocatorSample.cpp new file mode 100644 index 0000000000..51b3c81079 --- /dev/null +++ b/samples/CustomMemoryAllocatorSample.cpp @@ -0,0 +1,175 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include +#include + +#include + +/** Sample implementation of ICustomAllocator for use with the ClBackend. + * Note: any memory allocated must be host addressable with write access + * in order for ArmNN to be able to properly use it. 
*/ +class SampleClBackendCustomAllocator : public armnn::ICustomAllocator +{ +public: + SampleClBackendCustomAllocator() = default; + + void* allocate(size_t size, size_t alignment) + { + // If alignment is 0 just use the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment + if (alignment == 0) + { + alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo(); + } + size_t space = size + alignment + alignment; + auto allocatedMemPtr = std::malloc(space * sizeof(size_t)); + + if (std::align(alignment, size, allocatedMemPtr, space) == nullptr) + { + throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed"); + } + return allocatedMemPtr; + } + + void free(void* ptr) + { + std::free(ptr); + } + + armnn::MemorySource GetMemorySourceType() + { + return armnn::MemorySource::Malloc; + } +}; + + +// A simple example application to show the usage of a custom memory allocator. In this sample, the users single +// input number is multiplied by 1.0f using a fully connected layer with a single neuron to produce an output +// number that is the same as the input. All memory required to execute this mini network is allocated with +// the provided custom allocator. +// +// Using a Custom Allocator is required for use with Protected Mode and Protected Memory. +// This example is provided using only unprotected malloc as Protected Memory is platform +// and implementation specific. +// +// Note: This example is similar to the SimpleSample application that can also be found in armnn/samples. +// The differences are in the use of a custom allocator, the backend is GpuAcc, and the inputs/outputs +// are being imported instead of copied. (Import must be enabled when using a Custom Allocator) +// You might find this useful for comparison. 
+int main() +{ + using namespace armnn; + + float number; + std::cout << "Please enter a number: " << std::endl; + std::cin >> number; + + // Turn on logging to standard output + // This is useful in this sample so that users can learn more about what is going on + armnn::ConfigureLogging(true, false, LogSeverity::Info); + + // Construct ArmNN network + armnn::NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + armnn::FullyConnectedDescriptor fullyConnectedDesc; + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + weightsInfo.SetConstant(true); + armnn::ConstTensor weights(weightsInfo, weightsData); + ARMNN_NO_DEPRECATE_WARN_BEGIN + IConnectableLayer *fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, + weights, + EmptyOptional(), + "fully connected"); + ARMNN_NO_DEPRECATE_WARN_END + IConnectableLayer *InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer *OutputLayer = myNetwork->AddOutputLayer(0); + InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0)); + fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + // Create ArmNN runtime: + // + // This is the interesting bit when executing a model with a custom allocator. + // You can have different allocators for different backends. To support this + // the runtime creation option has a map that takes a BackendId and the corresponding + // allocator that should be used for that backend. 
+ // Only GpuAcc supports a Custom Allocator for now + // + // Note: This is not covered in this example but if you want to run a model on + // protected memory a custom allocator needs to be provided that supports + // protected memory allocations and the MemorySource of that allocator is + // set to MemorySource::DmaBufProtected + IRuntime::CreationOptions options; + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}}; + IRuntimePtr runtime = IRuntime::Create(options); + + //Set the tensors in the network. + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // Optimise ArmNN network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = true; + armnn::IOptimizedNetworkPtr optNet = + Optimize(*myNetwork, {"GpuAcc"}, runtime->GetDeviceSpec(), optOptions); + if (!optNet) + { + // This shouldn't happen for this simple sample, with GpuAcc backend. + // But in general usage Optimize could fail if the backend at runtime cannot + // support the model that has been provided. + std::cerr << "Error: Failed to optimise the input network." 
<< std::endl; + return 1; + } + + // Load graph into runtime + std::string ignoredErrorMessage; + INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); + runtime->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Creates structures for input & output + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + + // Input with negative values + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + + void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + auto* outputPtr = reinterpret_cast(alignedOutputPtr); + std::fill_n(outputPtr, numElements, -10.0f); + + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(runtime->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)}, + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(runtime->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)} + }; + + // Execute network + runtime->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + + // Tell the CLBackend to sync memory so we can read the output. 
+ arm_compute::CLScheduler::get().sync(); + auto* outputResult = reinterpret_cast(alignedOutputPtr); + std::cout << "Your number was " << outputResult[0] << std::endl; + runtime->UnloadNetwork(networkIdentifier); + return 0; + +} diff --git a/src/armnn/BackendRegistry.cpp b/src/armnn/BackendRegistry.cpp index ff63c8236a..80daed9896 100644 --- a/src/armnn/BackendRegistry.cpp +++ b/src/armnn/BackendRegistry.cpp @@ -39,6 +39,7 @@ void BackendRegistry::Register(const BackendId& id, BackendRegistry::FactoryFunc void BackendRegistry::Deregister(const BackendId& id) { m_Factories.erase(id); + DeregisterAllocator(id); if (m_ProfilingService.has_value() && m_ProfilingService.value().IsProfilingEnabled()) { @@ -106,5 +107,25 @@ void BackendRegistry::SetProfilingService(armnn::Optional alloc) +{ + if (m_CustomMemoryAllocatorMap.find(id) != m_CustomMemoryAllocatorMap.end()) + { + throw InvalidArgumentException( + std::string(id) + " already has an allocator associated with it", + CHECK_LOCATION()); + } + m_CustomMemoryAllocatorMap[id] = alloc; +} + +void BackendRegistry::DeregisterAllocator(const BackendId& id) +{ + m_CustomMemoryAllocatorMap.erase(id); +} + +std::unordered_map> BackendRegistry::GetAllocators() +{ + return m_CustomMemoryAllocatorMap; +} } // namespace armnn diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index c2b748653d..9fe58287c3 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -130,7 +130,8 @@ Status RuntimeImpl::LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr inNetwork, std::string& errorMessage) { - INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); + INetworkProperties networkProperties( + false, MemorySource::Undefined, MemorySource::Undefined); return LoadNetwork(networkIdOut, std::move(inNetwork), errorMessage, networkProperties); } @@ -267,7 +268,8 @@ RuntimeImpl::RuntimeImpl(const IRuntime::CreationOptions& options) if ( options.m_ProfilingOptions.m_TimelineEnabled && 
!options.m_ProfilingOptions.m_EnableProfiling ) { - throw RuntimeException("It is not possible to enable timeline reporting without profiling being enabled"); + throw RuntimeException( + "It is not possible to enable timeline reporting without profiling being enabled"); } // Load any available/compatible dynamic backend before the runtime @@ -283,6 +285,8 @@ RuntimeImpl::RuntimeImpl(const IRuntime::CreationOptions& options) auto backend = factoryFun(); ARMNN_ASSERT(backend.get() != nullptr); + auto customAllocatorMapIterator = options.m_CustomAllocatorMap.find(id); + // If the runtime is created in protected mode only add backends that support this mode if (options.m_ProtectedMode) { @@ -298,17 +302,61 @@ RuntimeImpl::RuntimeImpl(const IRuntime::CreationOptions& options) << " is not registered as does not support protected content allocation \n"; continue; } - std::string err; - if (!backend->UseCustomMemoryAllocator(err)) + // The user is responsible to provide a custom memory allocator which allows to allocate + // protected memory + if (customAllocatorMapIterator != options.m_CustomAllocatorMap.end()) { - ARMNN_LOG(error) << "The backend " + std::string err; + if (customAllocatorMapIterator->second->GetMemorySourceType() + == armnn::MemorySource::DmaBufProtected) + { + if (!backend->UseCustomMemoryAllocator(customAllocatorMapIterator->second, err)) + { + ARMNN_LOG(error) << "The backend " + << id + << " reported an error when entering protected mode. Backend won't be" + << " used. ErrorMsg: " << err; + continue; + } + // No errors so register the Custom Allocator with the BackendRegistry + BackendRegistryInstance().RegisterAllocator(id, customAllocatorMapIterator->second); + } + else + { + ARMNN_LOG(error) << "The CustomAllocator provided with the runtime options doesn't support " + "protected memory. Protected mode can't be activated. The backend " << id - << " reported an error when entering protected mode. Backend won't be used." 
- << " ErrorMsg: " << err; + << " is not going to be used. MemorySource must be MemorySource::DmaBufProtected"; + continue; + } + } + else + { + ARMNN_LOG(error) << "Protected mode can't be activated for backend: " + << id + << " no custom allocator was provided to the runtime options."; continue; } } - + else + { + // If a custom memory allocator is provided make the backend use that instead of the default + if (customAllocatorMapIterator != options.m_CustomAllocatorMap.end()) + { + std::string err; + if (!backend->UseCustomMemoryAllocator(customAllocatorMapIterator->second, err)) + { + ARMNN_LOG(error) << "The backend " + << id + << " reported an error when trying to use the provided custom allocator." + " Backend won't be used." + << " ErrorMsg: " << err; + continue; + } + // No errors so register the Custom Allocator with the BackendRegistry + BackendRegistryInstance().RegisterAllocator(id, customAllocatorMapIterator->second); + } + } auto context = backend->CreateBackendContext(options); // backends are allowed to return nullptrs if they diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index 19bd58193a..38aef671d2 100644 --- a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -220,9 +220,10 @@ public: return m_BackendCapabilities; }; - virtual bool UseCustomMemoryAllocator(armnn::Optional errMsg) override + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) override { - IgnoreUnused(errMsg); + IgnoreUnused(errMsg, allocator); m_CustomAllocator = true; return m_CustomAllocator; } @@ -925,131 +926,3 @@ TEST_CASE("OptimizeForExclusiveConnectionsWithoutFuseTest") &IsLayerOfType)); } } // Optimizer TestSuite - -TEST_SUITE("Runtime") -{ -// This test really belongs into RuntimeTests.cpp but it requires all sort of MockBackends which are -// already defined here -TEST_CASE("RuntimeProtectedModeOption") -{ - using namespace armnn; - - struct MockPolicy - { - static 
const BackendId& GetIdStatic() - { - static BackendId id = "MockBackend"; - return id; - } - }; - - struct ProtectedPolicy - { - static const BackendId& GetIdStatic() - { - static BackendId id = "MockBackendProtectedContent"; - return id; - } - }; - - struct SillyPolicy - { - static const BackendId& GetIdStatic() - { - static BackendId id = "SillyMockBackend"; - return id; - } - }; - - BackendCapabilities mockBackendCapabilities("MockBackend", - { - {"ProtectedContentAllocation", false} - }); - BackendCapabilities mockProtectedBackendCapabilities("MockBackendProtectedContent", - { - {"ProtectedContentAllocation", true} - }); - - auto& backendRegistry = BackendRegistryInstance(); - - // clean up from previous test runs - std::vector mockBackends = {"MockBackend", "MockBackendProtectedContent", "SillyMockBackend"}; - for (auto& backend : mockBackends) - { - backendRegistry.Deregister(backend); - } - - // Create a bunch of MockBackends with different capabilities - // 1. Doesn't support protected mode even though it knows about this capability - backendRegistry.Register("MockBackend", [mockBackendCapabilities]() - { - return std::make_unique>(mockBackendCapabilities); - }); - // 2. Supports protected mode and has it implemented correctly - backendRegistry.Register("MockBackendProtectedContent", [mockProtectedBackendCapabilities]() - { - return std::make_unique>(mockProtectedBackendCapabilities); - }); - // 3. 
Claims to support protected mode but doesn't have the UseCustomMemoryAllocator function implemented - backendRegistry.Register("SillyMockBackend", [mockProtectedBackendCapabilities]() - { - return std::make_unique>(mockProtectedBackendCapabilities); - }); - - // Creates a runtime that is not in protected mode - { - IRuntime::CreationOptions creationOptions; - creationOptions.m_ProtectedMode = false; - - IRuntimePtr run = IRuntime::Create(creationOptions); - - const armnn::BackendIdSet supportedDevices = run->GetDeviceSpec().GetSupportedBackends(); - // Both MockBackends that are registered should show up in the runtimes supported backends list - for (auto& backend : mockBackends) - { - CHECK(std::find(supportedDevices.cbegin(), supportedDevices.cend(), backend) != supportedDevices.cend()); - } - } - - // If the runtime is in protected mode only backends that support protected content should be added - { - IRuntime::CreationOptions creationOptions; - creationOptions.m_ProtectedMode = true; - - IRuntimePtr run = IRuntime::Create(creationOptions); - - const armnn::BackendIdSet supportedDevices = run->GetDeviceSpec().GetSupportedBackends(); - // Only the MockBackends that claims support for protected content should show up in the - // runtimes supported backends list - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackendProtectedContent") != supportedDevices.cend()); - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackend") == supportedDevices.cend()); - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "SillyMockBackend") == supportedDevices.cend()); - } - - // If the runtime is in protected mode only backends that support protected content should be added - { - IRuntime::CreationOptions creationOptions; - creationOptions.m_ProtectedMode = true; - - IRuntimePtr run = IRuntime::Create(creationOptions); - - const armnn::BackendIdSet supportedDevices = 
run->GetDeviceSpec().GetSupportedBackends(); - // Only the MockBackend that claims support for protected content should show up in the - // runtimes supported backends list - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackendProtectedContent") != supportedDevices.cend()); - - CHECK(std::find(supportedDevices.cbegin(), - supportedDevices.cend(), - "MockBackend") == supportedDevices.cend()); - } - -} -} diff --git a/src/backends/aclCommon/BaseMemoryManager.cpp b/src/backends/aclCommon/BaseMemoryManager.cpp index 45e0480a84..c60a4a04ae 100644 --- a/src/backends/aclCommon/BaseMemoryManager.cpp +++ b/src/backends/aclCommon/BaseMemoryManager.cpp @@ -15,7 +15,7 @@ namespace armnn { #if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) -BaseMemoryManager::BaseMemoryManager(std::unique_ptr alloc, +BaseMemoryManager::BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity) { ARMNN_ASSERT(alloc); diff --git a/src/backends/aclCommon/BaseMemoryManager.hpp b/src/backends/aclCommon/BaseMemoryManager.hpp index e80abf0edd..e3ffd188a1 100644 --- a/src/backends/aclCommon/BaseMemoryManager.hpp +++ b/src/backends/aclCommon/BaseMemoryManager.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #endif namespace armnn @@ -36,14 +37,14 @@ public: void Release() override; #if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) - BaseMemoryManager(std::unique_ptr alloc, MemoryAffinity memoryAffinity); + BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity); std::shared_ptr& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; } std::shared_ptr& GetInterLayerManager() { return m_InterLayerMemoryMgr; } std::shared_ptr& GetInterLayerMemoryGroup() { return m_InterLayerMemoryGroup; } protected: - std::unique_ptr m_Allocator; + std::shared_ptr m_Allocator; std::shared_ptr m_IntraLayerMemoryMgr; std::shared_ptr m_InterLayerMemoryMgr; std::shared_ptr m_InterLayerMemoryGroup; @@ -81,9 +82,10 @@ 
public: ClMemoryManager() {} virtual ~ClMemoryManager() {} - ClMemoryManager(std::unique_ptr alloc) + ClMemoryManager(std::shared_ptr alloc) : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer) { + arm_compute::CLTensorAllocator::set_global_allocator(alloc.get()); m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); } diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp index 12cb5e9956..34baad9d0c 100644 --- a/src/backends/backendsCommon/test/CompatibilityTests.cpp +++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp @@ -3,8 +3,12 @@ // SPDX-License-Identifier: MIT // +#if defined(ARMCOMPUTECL_ENABLED) #include +#endif +#if defined(ARMCOMPUTENEON_ENABLED) #include +#endif #include #include diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index f1e52c1998..b85232e75c 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -49,6 +49,10 @@ const BackendId& ClBackend::GetIdStatic() IBackendInternal::IMemoryManagerUniquePtr ClBackend::CreateMemoryManager() const { + if (m_UsingCustomAllocator) + { + return std::make_unique(m_CustomAllocator); + } return std::make_unique(std::make_unique()); } @@ -69,7 +73,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( TensorHandleFactoryRegistry& registry) const { - auto memoryManager = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); @@ -83,7 +95,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( IBackendInternal::IWorkloadFactoryPtr 
ClBackend::CreateWorkloadFactory( TensorHandleFactoryRegistry& registry, const ModelOptions& modelOptions) const { - auto memoryManager = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); @@ -100,7 +120,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) const { - auto memoryManager = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); @@ -118,10 +146,18 @@ std::vector ClBackend::GetHandleFactoryPreferen void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) { - auto mgr = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } - registry.RegisterMemoryManager(mgr); - registry.RegisterFactory(std::make_unique(mgr)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique(memoryManager)); registry.RegisterFactory(std::make_unique( static_cast(MemorySource::Malloc), static_cast(MemorySource::Malloc))); } @@ -130,10 +166,18 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) { - auto mgr = std::make_shared(std::make_unique()); + std::shared_ptr memoryManager; + if 
(m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } - registry.RegisterMemoryManager(mgr); - registry.RegisterFactory(std::make_unique(mgr)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique(memoryManager)); registry.RegisterFactory(std::make_unique(inputFlags, outputFlags)); } diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp index c742c0b204..c63bd25c56 100644 --- a/src/backends/cl/ClBackend.hpp +++ b/src/backends/cl/ClBackend.hpp @@ -6,6 +6,15 @@ #include +#include +#include + +#include +#include + +#include +#include + namespace armnn { @@ -20,7 +29,12 @@ const BackendCapabilities gpuAccCapabilities("GpuAcc", class ClBackend : public IBackendInternal { public: - ClBackend() : m_EnableCustomAllocator(false) {}; + ClBackend() : m_CustomAllocator(nullptr) {}; + ClBackend(std::shared_ptr allocator) + { + std::string err; + UseCustomMemoryAllocator(allocator, err); + } ~ClBackend() = default; static const BackendId& GetIdStatic(); @@ -72,17 +86,119 @@ public: return gpuAccCapabilities; }; - virtual bool UseCustomMemoryAllocator(armnn::Optional errMsg) override + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) override { IgnoreUnused(errMsg); + ARMNN_LOG(info) << "Using Custom Allocator for ClBackend"; // Set flag to signal the backend to use a custom memory allocator - m_EnableCustomAllocator = true; - - return m_EnableCustomAllocator; + m_CustomAllocator = std::make_shared(std::move(allocator)); + m_UsingCustomAllocator = true; + return m_UsingCustomAllocator; } - bool m_EnableCustomAllocator; + // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this + class ClBackendCustomAllocatorWrapper : public arm_compute::IAllocator + { + public: + ClBackendCustomAllocatorWrapper(std::shared_ptr alloc) : 
m_CustomAllocator(alloc) + {} + // Inherited methods overridden: + void* allocate(size_t size, size_t alignment) override + { + auto alloc = m_CustomAllocator->allocate(size, alignment); + return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType()); + } + void free(void* ptr) override + { + auto hostMemPtr = m_AllocatedBufferMappings[ptr]; + clReleaseMemObject(static_cast(ptr)); + m_CustomAllocator->free(hostMemPtr); + } + std::unique_ptr make_region(size_t size, size_t alignment) override + { + auto hostMemPtr = m_CustomAllocator->allocate(size, alignment); + cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType()); + + return std::make_unique(cl::Buffer(buffer), hostMemPtr); + } + private: + cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source) + { + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment); + + if (source == MemorySource::Malloc) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_HOST_ARM, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error)); + } + throw armnn::Exception( + "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator"); + } + std::shared_ptr m_CustomAllocator; + std::map m_AllocatedBufferMappings; + }; + + class ClBackendCustomAllocatorMemoryRegion : public 
arm_compute::ICLMemoryRegion + { + public: + // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access + ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr) + : ICLMemoryRegion(buffer.getInfo()) + { + _mem = buffer; + m_HostMemPtr = hostMemPtr; + } + + // Inherited methods overridden : + void* ptr() override + { + return nullptr; + } + + void* map(cl::CommandQueue &q, bool blocking) override + { + armnn::IgnoreUnused(q, blocking); + if (m_HostMemPtr == nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr"); + } + _mapping = m_HostMemPtr; + return _mapping; + } + + void unmap(cl::CommandQueue &q) override + { + armnn::IgnoreUnused(q); + _mapping = nullptr; + } + void* m_HostMemPtr = nullptr; + }; + + std::shared_ptr m_CustomAllocator; + bool m_UsingCustomAllocator = false; }; } // namespace armnn diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp index 3fca7cb127..69cd4a6d81 100644 --- a/src/backends/cl/ClImportTensorHandle.hpp +++ b/src/backends/cl/ClImportTensorHandle.hpp @@ -140,10 +140,16 @@ public: private: bool ClImport(const cl_import_properties_arm* importProperties, void* memory) { - const size_t totalBytes = m_Tensor.info()->total_size(); + size_t totalBytes = m_Tensor.info()->total_size(); + + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + cl_int error = CL_SUCCESS; cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), - CL_MEM_READ_WRITE, importProperties, memory, totalBytes, &error); + CL_MEM_READ_WRITE, importProperties, memory, roundedSize, &error); if (error != CL_SUCCESS) { throw 
MemoryImportException("ClImportTensorHandle::Invalid imported memory" + std::to_string(error)); diff --git a/src/backends/cl/ClRegistryInitializer.cpp b/src/backends/cl/ClRegistryInitializer.cpp index 8decd6f689..aadc14bd68 100644 --- a/src/backends/cl/ClRegistryInitializer.cpp +++ b/src/backends/cl/ClRegistryInitializer.cpp @@ -18,6 +18,14 @@ static BackendRegistry::StaticRegistryInitializer g_RegisterHelper ClBackend::GetIdStatic(), []() { + // Check if we have a CustomMemoryAllocator associated with the backend + // and if so register it with the backend. + auto customAllocators = BackendRegistryInstance().GetAllocators(); + auto allocatorIterator = customAllocators.find(ClBackend::GetIdStatic()); + if (allocatorIterator != customAllocators.end()) + { + return IBackendInternalUniquePtr(new ClBackend(allocatorIterator->second)); + } return IBackendInternalUniquePtr(new ClBackend); } }; diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt index 6662a1e659..41cbe24c15 100644 --- a/src/backends/cl/test/CMakeLists.txt +++ b/src/backends/cl/test/CMakeLists.txt @@ -6,6 +6,7 @@ list(APPEND armnnClBackendUnitTests_sources ClContextControlFixture.hpp ClContextSerializerTests.cpp + ClCustomAllocatorTests.cpp ClCreateWorkloadTests.cpp ClEndToEndTests.cpp ClImportTensorHandleFactoryTests.cpp diff --git a/src/backends/cl/test/ClCustomAllocatorTests.cpp b/src/backends/cl/test/ClCustomAllocatorTests.cpp new file mode 100644 index 0000000000..4d1a0e1cfb --- /dev/null +++ b/src/backends/cl/test/ClCustomAllocatorTests.cpp @@ -0,0 +1,160 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// Contains the OpenCl interfaces for mapping memory in the Gpu Page Tables +// Requires the OpenCl backend to be included (GpuAcc) +#include +#include +#include + + +/** Sample implementation of ICustomAllocator for use with the ClBackend. + * Note: any memory allocated must be host accessible with write access to allow for weights and biases + * to be passed in. Read access is not required. */ +class SampleClBackendCustomAllocator : public armnn::ICustomAllocator +{ +public: + SampleClBackendCustomAllocator() = default; + + void* allocate(size_t size, size_t alignment) + { + // If alignment is 0 just use the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment + if (alignment == 0) + { + alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo(); + } + size_t space = size + alignment + alignment; + auto allocatedMemPtr = std::malloc(space * sizeof(size_t)); + + if (std::align(alignment, size, allocatedMemPtr, space) == nullptr) + { + throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed"); + } + return allocatedMemPtr; + } + + /** Interface to be implemented by the child class to free the allocated tensor */ + void free(void* ptr) + { + std::free(ptr); + } + + armnn::MemorySource GetMemorySourceType() + { + return armnn::MemorySource::Malloc; + } +}; + +TEST_SUITE("ClCustomAllocatorTests") +{ + +// This is a copy of the SimpleSample app modified to use a custom +// allocator for the clbackend. 
It creates a FullyConnected network with a single layer +// taking a single number as an input +TEST_CASE("ClCustomAllocatorTest") +{ + using namespace armnn; + + float number = 3; + + // Construct ArmNN network + armnn::NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + + armnn::FullyConnectedDescriptor fullyConnectedDesc; + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + weightsInfo.SetConstant(true); + armnn::ConstTensor weights(weightsInfo, weightsData); + + ARMNN_NO_DEPRECATE_WARN_BEGIN + IConnectableLayer* fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, + weights, + EmptyOptional(), + "fully connected"); + ARMNN_NO_DEPRECATE_WARN_END + IConnectableLayer* InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0); + InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0)); + fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + //Set the tensors in the network. 
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // Optimise ArmNN network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = true; + armnn::IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {"GpuAcc"}, run->GetDeviceSpec(), optOptions); + CHECK(optNet); + + // Load graph into runtime + std::string ignoredErrorMessage; + INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); + run->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Create structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + + // Fill the input buffer with the test value + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + + void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment); + auto* outputPtr = reinterpret_cast(alignedOutputPtr); + std::fill_n(outputPtr, numElements, -10.0f); + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)}, + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)} + }; + + // Execute network + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + run->UnloadNetwork(networkIdentifier); + + + // Tell the CLBackend to sync memory so we can read the output. 
+ arm_compute::CLScheduler::get().sync(); + auto* outputResult = reinterpret_cast(alignedOutputPtr); + + run->UnloadNetwork(networkIdentifier); + CHECK(outputResult[0] == number); + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(ClBackend::GetIdStatic()); +} + +} // test suite ClCustomAllocatorTests \ No newline at end of file diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp index 931729a736..6b1d3521d5 100644 --- a/src/backends/cl/test/ClImportTensorHandleTests.cpp +++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp @@ -61,7 +61,7 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport") // Validate result by checking that the output has no negative values for(unsigned int i = 0; i < numElements; ++i) { - CHECK(typedPtr[i] >= 0); + CHECK(typedPtr[i] == 0); } } -- cgit v1.2.1