From 2b32a69f3aac5496d0a966d9740cb4854504f3d9 Mon Sep 17 00:00:00 2001 From: Cathal Corbett Date: Mon, 9 Jan 2023 12:47:48 +0000 Subject: IVGCVSW-7380 Update the GpuFsa Skeleton to build and load ACL * Reuse the cl backend to be able to create ClRuntime, ClContexts etc. for the new GpuFsa backend. * Code defined in the experimental dynamic_fusion interface can now be accessed. * No BackendModelContext, as model/backend options are not required for now. * Serializer and deserializer code is omitted, as context caching is not required. * No ImportTensorHandle or ImportTensorHandleFactory for now. * Moved tuning and IClTensorHandle code to aclCommon, as it is accessed by both cl and gpuFsa. * Small code refactor of the cl backend. * Added DefaultAllocatorTests to the GpuFsa backend. Signed-off-by: Cathal Corbett Change-Id: I6ae591360e9d2a783aafd06e2d7bf8e0b3e623ee --- CMakeLists.txt | 2 +- cmake/GlobalConfig.cmake | 19 +- src/armnn/Network.cpp | 14 +- src/backends/aclCommon/BaseMemoryManager.cpp | 14 +- src/backends/aclCommon/BaseMemoryManager.hpp | 34 +- src/backends/aclCommon/common.cmake | 4 +- src/backends/gpuFsa/CMakeLists.txt | 20 +- src/backends/gpuFsa/GpuFsaBackend.cpp | 172 +++++++--- src/backends/gpuFsa/GpuFsaBackend.hpp | 271 ++++++++++++++-- src/backends/gpuFsa/GpuFsaBackendContext.cpp | 230 ++++++++++++++ src/backends/gpuFsa/GpuFsaBackendContext.hpp | 47 +++ .../gpuFsa/GpuFsaBackendDefaultAllocator.hpp | 51 +++ src/backends/gpuFsa/GpuFsaContextControl.cpp | 163 ++++++++++ src/backends/gpuFsa/GpuFsaContextControl.hpp | 42 +++ src/backends/gpuFsa/GpuFsaMemoryManager.cpp | 101 ------ src/backends/gpuFsa/GpuFsaMemoryManager.hpp | 59 ---- src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp | 4 +- src/backends/gpuFsa/GpuFsaTensorHandle.cpp | 176 ----------- src/backends/gpuFsa/GpuFsaTensorHandle.hpp | 350 ++++++++++++++++++--- src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp | 67 ++-- src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp | 7 +- src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp | 58 +--- src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp | 18 +- src/backends/gpuFsa/backend.cmake | 4 +- src/backends/gpuFsa/backend.mk | 22 +- src/backends/gpuFsa/test/CMakeLists.txt | 3 +- .../gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp | 193 ++++++++++++ 27 files changed, 1582 insertions(+), 563 deletions(-) create mode 100644 src/backends/gpuFsa/GpuFsaBackendContext.cpp create mode 100644 src/backends/gpuFsa/GpuFsaBackendContext.hpp create mode 100644 src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp create mode 100644 src/backends/gpuFsa/GpuFsaContextControl.cpp create mode 100644 src/backends/gpuFsa/GpuFsaContextControl.hpp delete mode 100644 src/backends/gpuFsa/GpuFsaMemoryManager.cpp delete mode 100644 src/backends/gpuFsa/GpuFsaMemoryManager.hpp delete mode 100644 src/backends/gpuFsa/GpuFsaTensorHandle.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 476e080442..19626f2862 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,7 +502,7 @@ endif() install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) target_link_libraries(armnn PUBLIC ${ARMCOMPUTE_LIBRARIES}) endif() diff --git a/cmake/GlobalConfig.cmake b/cmake/GlobalConfig.cmake index bc9117f702..8a1211246c 100644 --- a/cmake/GlobalConfig.cmake +++ b/cmake/GlobalConfig.cmake @@ -10,7 +10,7 @@ option(BUILD_TESTS "Build test applications" OFF) option(BUILD_FOR_COVERAGE "Use no
optimization and output .gcno and .gcda files" OFF) option(ARMCOMPUTENEON "Build with ARM Compute NEON support" OFF) option(ARMCOMPUTECL "Build with ARM Compute OpenCL support" OFF) -option(ARMNNGPUFSA "Build with GPU Dynamic Fusion Backend" OFF) +option(ARMCOMPUTEGPUFSA "Build with GPU Dynamic Fusion Backend" OFF) option(ARMNNREF "Build with ArmNN reference support" ON) option(ARMNNTOSAREF "Build with TOSA reference support" OFF) option(PROFILING_BACKEND_STREAMLINE "Forward the armNN profiling events to DS-5/Streamline as annotations" OFF) @@ -261,7 +261,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/profiling) # ARM Compute # Note that ARM Compute has a different folder layout depending on the branch but also on # whether it comes from a prepackaged archive (this is why we add several hints below) -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) find_path(ARMCOMPUTE_INCLUDE arm_compute/core/CL/OpenCL.h PATHS ${ARMCOMPUTE_ROOT}/include PATHS ${ARMCOMPUTE_ROOT}/applications/arm_compute @@ -330,7 +330,7 @@ if(ARMCOMPUTENEON) endif() # ARM Compute OpenCL backend -if(ARMCOMPUTECL) +if(ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) # verify we have a valid flatbuffers include path find_path(FLATBUFFERS_INCLUDE_PATH flatbuffers/flatbuffers.h HINTS ${FLATBUFFERS_ROOT}/include /usr/local/include /usr/include) @@ -354,15 +354,22 @@ if(ARMCOMPUTECL) include_directories(SYSTEM ${OPENCL_INCLUDE}) - # Add preprocessor definition for ARM Compute OpenCL - add_definitions(-DARMCOMPUTECL_ENABLED) + if(ARMCOMPUTECL) + # Add preprocessor definition for ARM Compute OpenCL + add_definitions(-DARMCOMPUTECL_ENABLED) + endif() + + if(ARMCOMPUTEGPUFSA) + # Add preprocessor definition for ARM Compute OpenCL + add_definitions(-DARMCOMPUTEGPUFSA_ENABLED) + endif() set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DARM_COMPUTE_DEBUG_ENABLED") endif() # Used by both Arm Compute backends, but should be added # to the search path after the system directories if necessary -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) find_path(HALF_INCLUDE half/half.hpp) find_path(HALF_INCLUDE half/half.hpp PATHS ${ARMCOMPUTE_ROOT}/include diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index 42388bfbd7..cda87e89c2 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017,2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017,2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -1582,6 +1582,18 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph, ProfilerManager::GetInstance().RegisterProfiler(profiler.get()); profiler->EnableProfiling(options.m_ProfilingEnabled); + // Some backends don't play well together. Check here before continuing. + { + std::set backendSet(backendPreferences.begin(), backendPreferences.end()); + // GpuFsa cannot co-exist with GpuAcc. 
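+        // For illustration, a hypothetical caller that would trigger this check
+        // (the network/runtime names below are illustrative, not part of this change):
+        //     std::vector<armnn::BackendId> prefs = { "GpuFsa", "GpuAcc" };
+        //     armnn::Optimize(*network, prefs, runtime->GetDeviceSpec()); // throws below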
+ if (backendSet.find("GpuFsa") != backendSet.end() && + backendSet.find("GpuAcc") != backendSet.end()) + { + throw InvalidArgumentException("The backends \"GpuAcc\" and \"GpuFsa\" cannot be specified " + "for the same optimized network."); + } + } + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Optimizer"); if (backendPreferences.empty()) { diff --git a/src/backends/aclCommon/BaseMemoryManager.cpp b/src/backends/aclCommon/BaseMemoryManager.cpp index c60a4a04ae..e70d7f851d 100644 --- a/src/backends/aclCommon/BaseMemoryManager.cpp +++ b/src/backends/aclCommon/BaseMemoryManager.cpp @@ -1,10 +1,10 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #include "BaseMemoryManager.hpp" -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/PoolManager.h" #include "arm_compute/runtime/OffsetLifetimeManager.h" @@ -14,7 +14,7 @@ namespace armnn { -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) BaseMemoryManager::BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity) { @@ -104,4 +104,12 @@ ClMemoryManager::CreateMemoryGroup(const std::shared_ptr +GpuFsaMemoryManager::CreateMemoryGroup(const std::shared_ptr& memoryManager) +{ + return std::make_shared(memoryManager); } +#endif + +} \ No newline at end of file diff --git a/src/backends/aclCommon/BaseMemoryManager.hpp b/src/backends/aclCommon/BaseMemoryManager.hpp index af099f900a..c18c4830a0 100644 --- a/src/backends/aclCommon/BaseMemoryManager.hpp +++ b/src/backends/aclCommon/BaseMemoryManager.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #pragma once @@ -7,17 +7,13 @@ #include #include -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) -#include -#endif - -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) #include #include #include #endif -#if defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) #include #endif @@ -39,7 +35,7 @@ public: void Acquire() override; void Release() override; -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity); std::shared_ptr& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; } @@ -98,4 +94,24 @@ protected: }; #endif -} //namespace armnn +#if defined(ARMCOMPUTEGPUFSA_ENABLED) +class GpuFsaMemoryManager : public BaseMemoryManager +{ +public: + GpuFsaMemoryManager() {} + virtual ~GpuFsaMemoryManager() {} + + GpuFsaMemoryManager(std::shared_ptr alloc) + : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer) + { + arm_compute::CLTensorAllocator::set_global_allocator(alloc.get()); + m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); + } + +protected: + std::shared_ptr + CreateMemoryGroup(const std::shared_ptr& memoryManager) override; +}; +#endif + +} // namespace armnn diff --git a/src/backends/aclCommon/common.cmake b/src/backends/aclCommon/common.cmake index 89be236a7f..1ea14951a6 100644 --- a/src/backends/aclCommon/common.cmake +++ b/src/backends/aclCommon/common.cmake @@ -1,9 +1,9 @@ # -# Copyright © 2017 Arm Ltd. All rights reserved. +# Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT # -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/aclCommon) list(APPEND armnnLibraries armnnAclCommon) list(APPEND armnnUnitTestLibraries armnnAclCommonUnitTests) diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt index f5ddb34854..635b25b2d5 100644 --- a/src/backends/gpuFsa/CMakeLists.txt +++ b/src/backends/gpuFsa/CMakeLists.txt @@ -1,24 +1,26 @@ # -# Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
# SPDX-License-Identifier: MIT # -if(ARMNNGPUFSA) +if(ARMCOMPUTEGPUFSA) list(APPEND armnnGpuFsaBackend_sources GpuFsaBackend.cpp GpuFsaBackend.hpp + GpuFsaBackendContext.cpp + GpuFsaBackendContext.hpp + GpuFsaBackendDefaultAllocator.hpp GpuFsaBackendId.hpp - GpuFsaTensorHandle.hpp - GpuFsaTensorHandle.cpp + GpuFsaContextControl.cpp + GpuFsaContextControl.hpp GpuFsaLayerSupport.cpp GpuFsaLayerSupport.hpp - GpuFsaMemoryManager.hpp - GpuFsaMemoryManager.cpp GpuFsaRegistryInitializer.cpp - GpuFsaWorkloadFactory.cpp - GpuFsaWorkloadFactory.hpp + GpuFsaTensorHandle.hpp GpuFsaTensorHandleFactory.cpp GpuFsaTensorHandleFactory.hpp + GpuFsaWorkloadFactory.cpp + GpuFsaWorkloadFactory.hpp ) add_subdirectory(workloads) @@ -30,6 +32,8 @@ if(ARMNNGPUFSA) else() list(APPEND armnnGpuFsaBackend_sources GpuFsaBackendId.hpp + GpuFsaLayerSupport.cpp + GpuFsaLayerSupport.hpp ) endif() diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp index 9c2f4a0df6..ae7ff0c243 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.cpp +++ b/src/backends/gpuFsa/GpuFsaBackend.cpp @@ -1,23 +1,24 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #include "GpuFsaBackend.hpp" +#include "GpuFsaBackendContext.hpp" +#include "GpuFsaBackendDefaultAllocator.hpp" #include "GpuFsaBackendId.hpp" -#include "GpuFsaWorkloadFactory.hpp" #include "GpuFsaLayerSupport.hpp" #include "GpuFsaTensorHandleFactory.hpp" +#include "GpuFsaWorkloadFactory.hpp" -#include #include #include -#include -#include -#include - #include +#include + +#include + namespace armnn { @@ -27,6 +28,15 @@ const BackendId& GpuFsaBackend::GetIdStatic() return s_Id; } +IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const +{ + if (m_UsingCustomAllocator) + { + return std::make_unique(m_CustomAllocator); + } + return std::make_unique(std::make_unique()); +} + IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const { @@ -34,74 +44,142 @@ IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( } IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( - class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const + TensorHandleFactoryRegistry& registry) const { - auto memoryManager = std::make_shared(); - - tensorHandleFactoryRegistry.RegisterMemoryManager(memoryManager); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); - auto factory = std::make_unique(memoryManager); - // Register copy and import factory pair - tensorHandleFactoryRegistry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId()); - // Register the factory - tensorHandleFactoryRegistry.RegisterFactory(std::move(factory)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); return std::make_unique(PolymorphicPointerDowncast(memoryManager)); } -IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions&) const +IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( + TensorHandleFactoryRegistry& registry, + const ModelOptions& modelOptions, + 
MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const { - return IBackendContextPtr{}; + IgnoreUnused(modelOptions); + + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast(MemorySource::Undefined)) + { + inputFlags = static_cast(MemorySource::Malloc); + } + if (outputFlags == static_cast(MemorySource::Undefined)) + { + outputFlags = static_cast(MemorySource::Malloc); + } + + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + + return std::make_unique(PolymorphicPointerDowncast(memoryManager)); } -IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext( - const IRuntime::CreationOptions&, IBackendProfilingPtr&) +std::vector GpuFsaBackend::GetHandleFactoryPreferences() const { - return IBackendProfilingContextPtr{}; + return std::vector { GpuFsaTensorHandleFactory::GetIdStatic() }; } -IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const +void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) { - return std::make_unique(); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + } -IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const +void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) { - static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport}; - return layerSupport; + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast(MemorySource::Undefined)) + { + inputFlags = static_cast(MemorySource::Malloc); + } + if (outputFlags == static_cast(MemorySource::Undefined)) + { + outputFlags = static_cast(MemorySource::Malloc); + } + + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); } -OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph, - const ModelOptions& modelOptions) const +IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const { - OptimizationViews optimizationViews(modelOptions); - optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); - - return optimizationViews; + return IBackendContextPtr{new GpuFsaBackendContext{options}}; } -std::vector GpuFsaBackend::GetHandleFactoryPreferences() const +IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext( + const IRuntime::CreationOptions&, IBackendProfilingPtr&) { - return std::vector { GpuFsaTensorHandleFactory::GetIdStatic() }; + return 
IBackendProfilingContextPtr{}; } -void GpuFsaBackend::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry) +IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const { - auto memoryManager = std::make_shared(); - - registry.RegisterMemoryManager(memoryManager); - - auto factory = std::make_unique(memoryManager); - - // Register copy and import factory pair - registry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId()); - // Register the factory - registry.RegisterFactory(std::move(factory)); + static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport}; + return layerSupport; } std::unique_ptr GpuFsaBackend::GetDefaultAllocator() const { - return std::make_unique(); + return std::make_unique(); +} + +OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const +{ + OptimizationViews optimizationViews(modelOptions); + optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + return optimizationViews; } -} // namespace armnn \ No newline at end of file +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp index 803c6a4c66..6d886a12b1 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.hpp +++ b/src/backends/gpuFsa/GpuFsaBackend.hpp @@ -1,56 +1,287 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once #include +#include + +#include +#include +#include +#include + +// System includes for mapping and unmapping memory +#include namespace armnn { +// add new capabilities here.. +const BackendCapabilities gpuFsaCapabilities("GpuFsa", + { + {"NonConstWeights", false}, + {"AsyncExecution", false}, + {"ProtectedContentAllocation", true}, + {"ConstantTensorsAsInputs", true}, + {"PreImportIOTensors", false}, + {"ExternallyManagedMemory", true}, + {"MultiAxisPacking", false}, + {"SingleAxisPacking", true} + }); + class GpuFsaBackend : public IBackendInternal { public: - GpuFsaBackend() = default; + GpuFsaBackend() : m_CustomAllocator(nullptr) {}; + GpuFsaBackend(std::shared_ptr allocator) + { + std::string err; + UseCustomMemoryAllocator(allocator, err); + } ~GpuFsaBackend() = default; static const BackendId& GetIdStatic(); - const BackendId& GetId() const override - { - return GetIdStatic(); - } + const BackendId& GetId() const override { return GetIdStatic(); } IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override; IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory( - const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override; + const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override; - IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory( - class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const override; + IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override; - IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override; + IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const override; + + std::vector GetHandleFactoryPreferences() const override; + + void 
RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override; - IBackendInternal::IBackendProfilingContextPtr - CreateBackendProfilingContext(const IRuntime::CreationOptions& creationOptions, - IBackendProfilingPtr& backendProfiling) override; + void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) override; + + IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override; + IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext( + const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override; IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override; OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph, const ModelOptions& modelOptions) const override; - std::vector GetHandleFactoryPreferences() const override; + std::unique_ptr GetDefaultAllocator() const override; - void RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry) override; + BackendCapabilities GetCapabilities() const override + { + return gpuFsaCapabilities; + }; - std::unique_ptr GetDefaultAllocator() const override; + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) override + { + IgnoreUnused(errMsg); + ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend"; + + // Set flag to signal the backend to use a custom memory allocator + m_CustomAllocator = std::make_shared(std::move(allocator)); + m_UsingCustomAllocator = true; + return m_UsingCustomAllocator; + } + + // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this + class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator + { + public: + GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr alloc) : m_CustomAllocator(alloc) + {} + // Inherited methods overridden: + void* allocate(size_t size, size_t alignment) override + { + auto alloc = m_CustomAllocator->allocate(size, alignment); + return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType()); + } + void free(void* ptr) override + { + auto hostMemPtr = m_AllocatedBufferMappings[ptr]; + clReleaseMemObject(static_cast(ptr)); + m_CustomAllocator->free(hostMemPtr); + } + std::unique_ptr make_region(size_t size, size_t alignment) override + { + auto hostMemPtr = m_CustomAllocator->allocate(size, alignment); + cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType()); + + return std::make_unique(cl::Buffer(buffer), + hostMemPtr, + m_CustomAllocator->GetMemorySourceType()); + } + private: + cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source) + { + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment); + + if (source == MemorySource::Malloc) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_HOST_ARM, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + 
"Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error)); + } + else if (source == MemorySource::DmaBuf) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_DMA_BUF_ARM, + CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM, + CL_TRUE, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + + std::to_string(error)); + } + else if (source == MemorySource::DmaBufProtected) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_DMA_BUF_ARM, + CL_IMPORT_TYPE_PROTECTED_ARM, + CL_TRUE, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + + std::to_string(error)); + } + throw armnn::Exception( + "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator"); + } + std::shared_ptr m_CustomAllocator; + std::map m_AllocatedBufferMappings; + }; + + class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion + { + public: + // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access + ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source) + : ICLMemoryRegion(buffer.getInfo()) + { + _mem = buffer; + m_HostMemPtr = hostMemPtr; + m_MemorySource = source; + } + + // Inherited methods overridden : + void* ptr() override + { + return nullptr; + } + + void* map(cl::CommandQueue &q, bool blocking) override + { + armnn::IgnoreUnused(q, blocking); + if (m_HostMemPtr == nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr"); + } + if (_mapping != nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped"); + } + switch (m_MemorySource) + { + case armnn::MemorySource::Malloc: + _mapping = m_HostMemPtr; + return _mapping; + break; + case armnn::MemorySource::DmaBuf: + case armnn::MemorySource::DmaBufProtected: + // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd + _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast(m_HostMemPtr)), 0); + return _mapping; + break; + default: + throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source"); + break; + } + } -private: - // Private members + void unmap(cl::CommandQueue &q) override + { + armnn::IgnoreUnused(q); + switch (m_MemorySource) + { + case armnn::MemorySource::Malloc: + _mapping = nullptr; + break; + case armnn::MemorySource::DmaBuf: + case armnn::MemorySource::DmaBufProtected: + munmap(_mapping, _size); + _mapping = nullptr; + break; + default: + throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source"); + 
break; + } + } + private: + void* m_HostMemPtr = nullptr; + armnn::MemorySource m_MemorySource; + }; -protected: - // Protected members + std::shared_ptr m_CustomAllocator; + bool m_UsingCustomAllocator = false; }; } // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.cpp b/src/backends/gpuFsa/GpuFsaBackendContext.cpp new file mode 100644 index 0000000000..72b77e0d19 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendContext.cpp @@ -0,0 +1,230 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaBackendContext.hpp" +#include "GpuFsaContextControl.hpp" + +#include +#include + +#include + +namespace armnn +{ + +struct GpuFsaBackendContext::GpuFsaContextControlWrapper +{ + GpuFsaContextControlWrapper(arm_compute::CLTuner* tuner, + arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle, + bool profilingEnabled) + : m_GpuFsaContextControl(tuner, heuristicsHandle, profilingEnabled) + {} + + bool Sync() + { + if (arm_compute::CLScheduler::get().context()() != NULL) + { + // Waits for all queued CL requests to finish before unloading the network they may be using. + try + { + // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error. + arm_compute::CLScheduler::get().sync(); + } + catch (const cl::Error&) + { + ARMNN_LOG(warning) << "Runtime::UnloadNetwork(): an error occurred while waiting for " + "the queued CL requests to finish"; + return false; + } + } + + return true; + } + + void ClearClCache() + { + if (arm_compute::CLScheduler::get().context()() != NULL) + { + // There are no loaded networks left, so clear the CL cache to free up memory + m_GpuFsaContextControl.ClearClCache(); + } + } + + GpuFsaContextControl m_GpuFsaContextControl; +}; + +GpuFsaBackendContext::GpuFsaBackendContext(const IRuntime::CreationOptions& options) + : IBackendContext(options) + , m_TuningFile() +{ + bool kernelProfiling = options.m_EnableGpuProfiling; + + arm_compute::CLTuner* tuner = nullptr; + arm_compute::CLGEMMHeuristicsHandle* mlgoTuner = nullptr; + bool useLegacyTunerAPI = options.m_GpuAccTunedParameters.get() != nullptr; + if (useLegacyTunerAPI) + { + auto clTunerParams = PolymorphicDowncast( + options.m_GpuAccTunedParameters.get()); + tuner = &clTunerParams->m_Tuner; + + if (tuner) + { + auto ConvertTuningLevel = [](IGpuAccTunedParameters::TuningLevel level, + armnn::IGpuAccTunedParameters::Mode mode) + { + if (mode == armnn::IGpuAccTunedParameters::Mode::UseTunedParameters) + { + return TuningLevel::None; + } + + switch(level) + { + case IGpuAccTunedParameters::TuningLevel::Rapid: + return TuningLevel::Rapid; + case IGpuAccTunedParameters::TuningLevel::Normal: + return TuningLevel::Normal; + case IGpuAccTunedParameters::TuningLevel::Exhaustive: + return TuningLevel::Exhaustive; + default: + { + ARMNN_ASSERT_MSG(false, "Tuning level not recognised."); + return TuningLevel::None; + } + } + }; + + TuningLevel tuningLevel = ConvertTuningLevel(clTunerParams->m_TuningLevel, clTunerParams->m_Mode); + ConfigureTuner(*tuner, tuningLevel); + } + } + else //New backend options API + { + const TuningLevel defaultTuningLevel = TuningLevel::None; + auto tuningLevel = defaultTuningLevel; + + ParseOptions(options.m_BackendOptions, "GpuFsa", [&](std::string name, const BackendOptions::Var& value) + { + if (name == "KernelProfilingEnabled") + { + kernelProfiling |= ParseBooleanBackendOption(value, false); + } else if (name == "TuningFile") + { + m_TuningFile = ParseStringBackendOption(value, 
""); + } else if (name == "TuningLevel") + { + tuningLevel = ParseTuningLevel(value, defaultTuningLevel); + } + else if (name == "MLGOTuningFilePath") + { + m_MLGOTuningFile = ParseStringBackendOption(value, ""); + } + }); + + // Create the tuner, in tuning mode initially. + m_Tuner = std::make_unique(true); + + ConfigureTuner(*(m_Tuner.get()), tuningLevel); + + if (!m_TuningFile.empty()) + { + try + { + ARMNN_LOG(info) << "Loading Gpu tuning data from file: " << m_TuningFile; + m_Tuner->load_from_file(m_TuningFile.c_str()); + } + catch (const std::exception& e) + { + // Warn if not tuning, otherwise tuning will generate new params + if (tuningLevel == TuningLevel::None) + { + ARMNN_LOG(warning) << "Could not load GpuFsa tuner data file."; + } + } + } + + if (!m_MLGOTuningFile.empty()) + { + try + { + ARMNN_LOG(info) << "Loading Gpu MLGO tuning data from file: " << m_TuningFile; + if(m_MLGOTuner.reload_from_file(m_MLGOTuningFile.c_str())) + { + mlgoTuner = &m_MLGOTuner; + } + } + catch (const std::exception& e) + { + ARMNN_LOG(warning) << "Could not load GpuFsa MLGO tuner data file."; + } + } + + tuner = m_Tuner.get(); + } + + m_GpuFsaContextControlWrapper = std::make_unique( + tuner, + mlgoTuner, + kernelProfiling + ); +} + +bool GpuFsaBackendContext::BeforeLoadNetwork(NetworkId) +{ + return true; +} + +bool GpuFsaBackendContext::AfterLoadNetwork(NetworkId networkId) +{ + { + std::lock_guard lockGuard(m_Mutex); + m_NetworkIds.insert(networkId); + } + return true; +} + +bool GpuFsaBackendContext::BeforeUnloadNetwork(NetworkId) +{ + return m_GpuFsaContextControlWrapper->Sync(); +} + +bool GpuFsaBackendContext::AfterUnloadNetwork(NetworkId networkId) +{ + bool clearCache = false; + { + std::lock_guard lockGuard(m_Mutex); + m_NetworkIds.erase(networkId); + clearCache = m_NetworkIds.empty(); + } + + if (clearCache) + { + m_GpuFsaContextControlWrapper->ClearClCache(); + } + + return true; +} + +bool GpuFsaBackendContext::AfterEnqueueWorkload(NetworkId) +{ + return m_GpuFsaContextControlWrapper->Sync(); +} + +GpuFsaBackendContext::~GpuFsaBackendContext() +{ + if (m_Tuner && !m_TuningFile.empty()) + { + try + { + m_Tuner->save_to_file(m_TuningFile.c_str()); + } + catch(const std::exception& e) + { + ARMNN_LOG(warning) << "Could not save GpuFsa tuner data to file " << m_TuningFile; + } + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.hpp b/src/backends/gpuFsa/GpuFsaBackendContext.hpp new file mode 100644 index 0000000000..271688fd99 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendContext.hpp @@ -0,0 +1,47 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include +#include + +#include +#include + +namespace armnn +{ + +class GpuFsaBackendContext : public IBackendContext +{ +public: + GpuFsaBackendContext(const IRuntime::CreationOptions& options); + + bool BeforeLoadNetwork(NetworkId networkId) override; + bool AfterLoadNetwork(NetworkId networkId) override; + + bool BeforeUnloadNetwork(NetworkId networkId) override; + bool AfterUnloadNetwork(NetworkId networkId) override; + + bool AfterEnqueueWorkload(NetworkId networkId) override; + + ~GpuFsaBackendContext() override; + +private: + std::mutex m_Mutex; + struct GpuFsaContextControlWrapper; + std::unique_ptr m_GpuFsaContextControlWrapper; + + std::unordered_set m_NetworkIds; + + std::unique_ptr m_Tuner; + std::string m_TuningFile; + +protected: + arm_compute::CLGEMMHeuristicsHandle m_MLGOTuner; + std::string m_MLGOTuningFile; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp new file mode 100644 index 0000000000..c57ff63b92 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp @@ -0,0 +1,51 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +#include +#include + +namespace armnn +{ + +/** +* Default Memory Allocator class returned from IBackendInternal::GetDefaultAllocator(MemorySource) +*/ +class GpuFsaBackendDefaultAllocator : public ICustomAllocator +{ +public: + GpuFsaBackendDefaultAllocator() = default; + + void* allocate(size_t size, size_t alignment = 0) override + { + IgnoreUnused(alignment); + cl_mem buf{ clCreateBuffer(arm_compute::CLScheduler::get().context().get(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + size, + nullptr, + nullptr)}; + return static_cast(buf); + } + + void free(void* ptr) override + { + ARM_COMPUTE_ERROR_ON(ptr == nullptr); + clReleaseMemObject(static_cast(ptr)); + } + + MemorySource GetMemorySourceType() override + { + return MemorySource::Gralloc; + } + + void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override + { + IgnoreUnused(alignment); + return static_cast(buffer) + offset; + } +}; +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaContextControl.cpp b/src/backends/gpuFsa/GpuFsaContextControl.cpp new file mode 100644 index 0000000000..795de5e14d --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaContextControl.cpp @@ -0,0 +1,163 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaContextControl.hpp" + +#include +#include +#include + +#include +#include + +#include + +namespace cl +{ +class Context; +class CommandQueue; +class Device; +} + +namespace armnn +{ + +GpuFsaContextControl::GpuFsaContextControl(arm_compute::CLTuner *tuner, + arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle, + bool profilingEnabled) + : m_Tuner(tuner) + , m_HeuristicsHandle(heuristicsHandle) + , m_ProfilingEnabled(profilingEnabled) +{ + try + { + std::vector platforms; + cl::Platform::get(&platforms); + + // Selects default platform for the first element. + cl::Platform::setDefault(platforms[0]); + + std::vector devices; + platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); + + // Selects default device for the first element. 
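+        // (This assumes at least one CL_DEVICE_TYPE_GPU device is exposed by platform 0;
+        // on a multi-GPU system the first enumerated device may not be the intended one.)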
+ cl::Device::setDefault(devices[0]); + } + catch (const cl::Error& clError) + { + throw ClRuntimeUnavailableException(fmt::format( + "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}", + clError.what(), clError.err())); + } + + // Removes the use of global CL context. + cl::Context::setDefault(cl::Context{}); + ARMNN_ASSERT(cl::Context::getDefault()() == NULL); + + // Removes the use of global CL command queue. + cl::CommandQueue::setDefault(cl::CommandQueue{}); + ARMNN_ASSERT(cl::CommandQueue::getDefault()() == NULL); + + // Always load the OpenCL runtime. + LoadOpenClRuntime(); +} + +GpuFsaContextControl::~GpuFsaContextControl() +{ + // Reload the OpenCL runtime without the tuned parameters to free the memory used by them. + try + { + UnloadOpenClRuntime(); + } + catch (const cl::Error& clError) + { + // This should not happen; it is ignored if it does. + + // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an + // exception of type std::length_error. + // Using stderr instead in this context as there is no point in nesting try-catch blocks here. + std::cerr << "A CL error occurred unloading the runtime tuner parameters: " + << clError.what() << ". CL error code is: " << clError.err() << std::endl; + } +} + +void GpuFsaContextControl::LoadOpenClRuntime() +{ + DoLoadOpenClRuntime(true); +} + +void GpuFsaContextControl::UnloadOpenClRuntime() +{ + DoLoadOpenClRuntime(false); +} + +void GpuFsaContextControl::DoLoadOpenClRuntime(bool updateTunedParameters) +{ + cl::Device device = cl::Device::getDefault(); + cl::Context context; + cl::CommandQueue commandQueue; + + if (arm_compute::CLScheduler::get().is_initialised() && arm_compute::CLScheduler::get().context()() != NULL) + { + // Wait for all queued CL requests to finish before reinitialising it. + arm_compute::CLScheduler::get().sync(); + } + + try + { + arm_compute::CLKernelLibrary::get().clear_programs_cache(); + // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no + // context references); it is initialised again, with a proper context, later. + arm_compute::CLScheduler::get().init(context, commandQueue, device); + arm_compute::CLKernelLibrary::get().init(".", context, device); + + { + // + // Here we replace the context with a new one in which + // the memory leak checks show it as an extra allocation but + // because of the scope of the leak checks, it doesn't count + // the disposal of the original object. On the other hand it + // does count the creation of this context which it flags + // as a memory leak. By adding the following line we prevent + // this from happening. + // + ARMNN_DISABLE_LEAK_CHECKING_IN_SCOPE(); + context = cl::Context(device); + } + + // NOTE: In this specific case profiling has to be enabled on the command queue + // in order for the CLTuner to work. + bool profilingNeededForClTuner = updateTunedParameters && m_Tuner && + m_Tuner->tune_new_kernels(); + + if (m_ProfilingEnabled || profilingNeededForClTuner) + { + // Create a new queue with profiling enabled. + commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); + } + else + { + // Use default queue. + commandQueue = cl::CommandQueue(context, device); + } + } + catch (const cl::Error& clError) + { + throw ClRuntimeUnavailableException(fmt::format( + "Could not initialize the CL runtime. Error description: {0}.
CL error code: {1}", + clError.what(), clError.err())); + } + + // Note the first argument (path to cl source code) will be ignored as they should be embedded in the armcompute. + arm_compute::CLKernelLibrary::get().init(".", context, device); + arm_compute::CLScheduler::get().init(context, commandQueue, device, m_Tuner, m_HeuristicsHandle); +} + +void GpuFsaContextControl::ClearClCache() +{ + DoLoadOpenClRuntime(true); +} + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaContextControl.hpp b/src/backends/gpuFsa/GpuFsaContextControl.hpp new file mode 100644 index 0000000000..f77b1fbdd4 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaContextControl.hpp @@ -0,0 +1,42 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +namespace armnn +{ + +// ARM Compute OpenCL context control. +class GpuFsaContextControl +{ +public: + + GpuFsaContextControl(arm_compute::CLTuner* = nullptr, + arm_compute::CLGEMMHeuristicsHandle* = nullptr, + bool profilingEnabled = false); + + virtual ~GpuFsaContextControl(); + + void LoadOpenClRuntime(); + + // Users should call this (after freeing all of the cl::Context objects they use) + // to release the cached memory used by the compute library. + void UnloadOpenClRuntime(); + + // Clear the CL cache, without losing the tuned parameter settings. + void ClearClCache(); + +private: + + void DoLoadOpenClRuntime(bool updateTunedParameters); + + arm_compute::CLTuner* m_Tuner; + arm_compute::CLGEMMHeuristicsHandle* m_HeuristicsHandle; + + bool m_ProfilingEnabled; +}; + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp deleted file mode 100644 index 4eefb87d88..0000000000 --- a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#include "GpuFsaMemoryManager.hpp" - -#include - -#include - -namespace armnn -{ - -GpuFsaMemoryManager::GpuFsaMemoryManager() -{} - -GpuFsaMemoryManager::~GpuFsaMemoryManager() -{} - -GpuFsaMemoryManager::Pool* GpuFsaMemoryManager::Manage(unsigned int numBytes) -{ - if (!m_FreePools.empty()) - { - Pool* res = m_FreePools.back(); - m_FreePools.pop_back(); - res->Reserve(numBytes); - return res; - } - else - { - m_Pools.push_front(Pool(numBytes)); - return &m_Pools.front(); - } -} - -void GpuFsaMemoryManager::Allocate(GpuFsaMemoryManager::Pool* pool) -{ - ARMNN_ASSERT(pool); - m_FreePools.push_back(pool); -} - -void* GpuFsaMemoryManager::GetPointer(GpuFsaMemoryManager::Pool* pool) -{ - return pool->GetPointer(); -} - -void GpuFsaMemoryManager::Acquire() -{ - for (Pool &pool: m_Pools) - { - pool.Acquire(); - } -} - -void GpuFsaMemoryManager::Release() -{ - for (Pool &pool: m_Pools) - { - pool.Release(); - } -} - -GpuFsaMemoryManager::Pool::Pool(unsigned int numBytes) - : m_Size(numBytes), - m_Pointer(nullptr) -{} - -GpuFsaMemoryManager::Pool::~Pool() -{ - if (m_Pointer) - { - Release(); - } -} - -void* GpuFsaMemoryManager::Pool::GetPointer() -{ - ARMNN_ASSERT_MSG(m_Pointer, "GpuFsaMemoryManager::Pool::GetPointer() called when memory not acquired"); - return m_Pointer; -} - -void GpuFsaMemoryManager::Pool::Reserve(unsigned int numBytes) -{ - ARMNN_ASSERT_MSG(!m_Pointer, "GpuFsaMemoryManager::Pool::Reserve() cannot be called after memory acquired"); - m_Size = std::max(m_Size, numBytes); -} - -void GpuFsaMemoryManager::Pool::Acquire() -{ - ARMNN_ASSERT_MSG(!m_Pointer, "GpuFsaMemoryManager::Pool::Acquire() called when memory already acquired"); - m_Pointer = ::operator new(size_t(m_Size)); -} - -void GpuFsaMemoryManager::Pool::Release() -{ - ARMNN_ASSERT_MSG(m_Pointer, "GpuFsaMemoryManager::Pool::Release() called when memory not acquired"); - ::operator delete(m_Pointer); - m_Pointer = nullptr; -} - -} \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp deleted file mode 100644 index 636b839a51..0000000000 --- a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#pragma once - -#include - -#include -#include - -namespace armnn -{ - -// A dummy MemoryManager which will be deleted once the GpuFsa Backend is integrated with ClMemoryManager -class GpuFsaMemoryManager : public IMemoryManager -{ -public: - GpuFsaMemoryManager(); - virtual ~GpuFsaMemoryManager(); - - class Pool; - - Pool* Manage(unsigned int numBytes); - - void Allocate(Pool *pool); - - void* GetPointer(Pool *pool); - - void Acquire() override; - void Release() override; - - class Pool - { - public: - Pool(unsigned int numBytes); - ~Pool(); - - void Acquire(); - void Release(); - - void* GetPointer(); - - void Reserve(unsigned int numBytes); - - private: - unsigned int m_Size; - void* m_Pointer; - }; - -private: - GpuFsaMemoryManager(const GpuFsaMemoryManager&) = delete; // Noncopyable - GpuFsaMemoryManager& operator=(const GpuFsaMemoryManager&) = delete; // Noncopyable - - std::forward_list m_Pools; - std::vector m_FreePools; -}; - -} diff --git a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp index 875b7d7112..9efb300576 100644 --- a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp +++ b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp @@ -1,9 +1,11 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // + #include "GpuFsaBackend.hpp" #include + namespace { using namespace armnn; diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp deleted file mode 100644 index e806be49bb..0000000000 --- a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#include "GpuFsaTensorHandle.hpp" - -namespace armnn -{ -GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo, - std::shared_ptr& memoryManager) - : m_TensorInfo(tensorInfo) - , m_MemoryManager(memoryManager) - , m_Pool(nullptr) - , m_UnmanagedMemory(nullptr) - , m_ImportFlags(static_cast(MemorySource::Undefined)) - , m_Imported(false) - , m_IsImportEnabled(false) -{} - -GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo, - MemorySourceFlags importFlags) - : m_TensorInfo(tensorInfo) - , m_Pool(nullptr) - , m_UnmanagedMemory(nullptr) - , m_ImportFlags(importFlags) - , m_Imported(false) - , m_IsImportEnabled(true) -{} - -GpuFsaTensorHandle::~GpuFsaTensorHandle() -{ - if (!m_Pool) - { - // unmanaged - if (!m_Imported) - { - ::operator delete(m_UnmanagedMemory); - } - } -} - -void GpuFsaTensorHandle::Manage() -{ - if (!m_IsImportEnabled) - { - ARMNN_ASSERT_MSG(!m_Pool, "GpuFsaTensorHandle::Manage() called twice"); - ARMNN_ASSERT_MSG(!m_UnmanagedMemory, "GpuFsaTensorHandle::Manage() called after Allocate()"); - - m_Pool = m_MemoryManager->Manage(m_TensorInfo.GetNumBytes()); - } -} - -void GpuFsaTensorHandle::Allocate() -{ - // If import is enabled, do not allocate the tensor - if (!m_IsImportEnabled) - { - - if (!m_UnmanagedMemory) - { - if (!m_Pool) - { - // unmanaged - m_UnmanagedMemory = ::operator new(m_TensorInfo.GetNumBytes()); - } - else - { - m_MemoryManager->Allocate(m_Pool); - } - } - else - { - throw InvalidArgumentException("GpuFsaTensorHandle::Allocate Trying to allocate a GpuFsaTensorHandle" - "that already has allocated memory."); - } - } -} - -const void* GpuFsaTensorHandle::Map(bool /*unused*/) const -{ - return GetPointer(); -} - -void* GpuFsaTensorHandle::GetPointer() const -{ - if (m_UnmanagedMemory) - { - return m_UnmanagedMemory; - } - else if (m_Pool) - { - return m_MemoryManager->GetPointer(m_Pool); - } - else - { - throw NullPointerException("GpuFsaTensorHandle::GetPointer called on unmanaged, unallocated tensor handle"); - } -} - -void GpuFsaTensorHandle::CopyOutTo(void* dest) const -{ - const void *src = GetPointer(); - ARMNN_ASSERT(src); - memcpy(dest, src, m_TensorInfo.GetNumBytes()); -} - -void GpuFsaTensorHandle::CopyInFrom(const void* src) -{ - void *dest = GetPointer(); - ARMNN_ASSERT(dest); - memcpy(dest, src, m_TensorInfo.GetNumBytes()); -} - -bool GpuFsaTensorHandle::Import(void* memory, MemorySource source) -{ - if (m_ImportFlags & static_cast(source)) - { - if (m_IsImportEnabled && source == MemorySource::Malloc) - { - // Check memory alignment - if(!CanBeImported(memory, source)) - { - if (m_Imported) - { - m_Imported = false; - m_UnmanagedMemory = nullptr; - } - return false; - } - - // m_UnmanagedMemory not yet allocated. - if (!m_Imported && !m_UnmanagedMemory) - { - m_UnmanagedMemory = memory; - m_Imported = true; - return true; - } - - // m_UnmanagedMemory initially allocated with Allocate(). - if (!m_Imported && m_UnmanagedMemory) - { - return false; - } - - // m_UnmanagedMemory previously imported. 
- if (m_Imported) - { - m_UnmanagedMemory = memory; - return true; - } - } - } - - return false; -} - -bool GpuFsaTensorHandle::CanBeImported(void* memory, MemorySource source) -{ - if (m_ImportFlags & static_cast(source)) - { - if (m_IsImportEnabled && source == MemorySource::Malloc) - { - uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType()); - if (reinterpret_cast(memory) % alignment) - { - return false; - } - return true; - } - } - return false; -} - - - -} \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp index b2da50a467..d6901d1225 100644 --- a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp +++ b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp @@ -1,83 +1,361 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once -#include +#include +#include -#include "GpuFsaMemoryManager.hpp" +#include +#include + +#include +#include +#include +#include +#include +#include + +#include namespace armnn { -// An implementation of ITensorHandle with simple "bump the pointer" memory-management behaviour -// Will be refactored to look more like ClTensorHandle.hpp and use ClMemoryManager instead of GpuFsaMemoryManager -class GpuFsaTensorHandle : public ITensorHandle +class GpuFsaTensorHandle : public IClTensorHandle { public: - GpuFsaTensorHandle(const TensorInfo& tensorInfo, std::shared_ptr& memoryManager); + GpuFsaTensorHandle(const TensorInfo& tensorInfo) + : m_ImportFlags(static_cast(MemorySource::Undefined)), + m_Imported(false), + m_IsImportEnabled(false) + { + armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo); + } - GpuFsaTensorHandle(const TensorInfo& tensorInfo, MemorySourceFlags importFlags); + GpuFsaTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + MemorySourceFlags importFlags = static_cast(MemorySource::Undefined)) + : m_ImportFlags(importFlags), + m_Imported(false), + m_IsImportEnabled(false) + { + armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout); + } - ~GpuFsaTensorHandle(); + arm_compute::CLTensor& GetTensor() override { return m_Tensor; } + arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; } + virtual void Allocate() override + { + // If we have enabled Importing, don't allocate the tensor + if (m_IsImportEnabled) + { + throw MemoryImportException("GpuFsaTensorHandle::Attempting to allocate memory when importing"); + } + else + { + armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor); + } - virtual void Manage() override; + } - virtual void Allocate() override; + virtual void Manage() override + { + // If we have enabled Importing, don't manage the tensor + if (m_IsImportEnabled) + { + throw MemoryImportException("GpuFsaTensorHandle::Attempting to manage memory when importing"); + } + else + { + assert(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } + } - virtual ITensorHandle* GetParent() const override + virtual const void* Map(bool blocking = true) const override { - return nullptr; + const_cast(&m_Tensor)->map(blocking); + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); } - virtual const void* Map(bool /* blocking = true */) const override; - using ITensorHandle::Map; + virtual void Unmap() const override { const_cast(&m_Tensor)->unmap(); } + + virtual ITensorHandle* GetParent() 
const override { return nullptr; } + + virtual arm_compute::DataType GetDataType() const override + { + return m_Tensor.info()->data_type(); + } - virtual void Unmap() const override - {} + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) override + { + m_MemoryGroup = PolymorphicPointerDowncast(memoryGroup); + } TensorShape GetStrides() const override { - return GetUnpaddedTensorStrides(m_TensorInfo); + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); } TensorShape GetShape() const override { - return m_TensorInfo.GetShape(); + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); } - const TensorInfo& GetTensorInfo() const + void SetImportFlags(MemorySourceFlags importFlags) { - return m_TensorInfo; + m_ImportFlags = importFlags; } - virtual MemorySourceFlags GetImportFlags() const override + MemorySourceFlags GetImportFlags() const override { return m_ImportFlags; } - virtual bool Import(void* memory, MemorySource source) override; - virtual bool CanBeImported(void* memory, MemorySource source) override; + void SetImportEnabledFlag(bool importEnabledFlag) + { + m_IsImportEnabled = importEnabledFlag; + } -private: - // Only used for testing - void CopyOutTo(void*) const override; - void CopyInFrom(const void*) override; + virtual bool Import(void* /*memory*/, MemorySource source) override + { + if (m_ImportFlags & static_cast(source)) + { + throw MemoryImportException("GpuFsaTensorHandle::Incorrect import flag"); + } + m_Imported = false; + return false; + } - void* GetPointer() const; + virtual bool CanBeImported(void* /*memory*/, MemorySource /*source*/) override + { + // This TensorHandle can never import. + return false; + } - GpuFsaTensorHandle(const GpuFsaTensorHandle& other) = delete; // noncopyable - GpuFsaTensorHandle& operator=(const GpuFsaTensorHandle& other) = delete; //noncopyable +private: + // Only used for testing + void CopyOutTo(void* memory) const override + { + const_cast(this)->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + case arm_compute::DataType::QASYMM8_SIGNED: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::F16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S16: + case arm_compute::DataType::QSYMM16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + default: + { + throw armnn::UnimplementedException(); + } + } + const_cast(this)->Unmap(); + } - TensorInfo m_TensorInfo; + // Only used for testing + void CopyInFrom(const void* memory) override + { + this->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + 
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        this->Unmap();
+    }
 
-    std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager;
-    GpuFsaMemoryManager::Pool* m_Pool;
-    mutable void* m_UnmanagedMemory;
+    arm_compute::CLTensor m_Tensor;
+    std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup;
     MemorySourceFlags m_ImportFlags;
     bool m_Imported;
     bool m_IsImportEnabled;
 };
 
-}
\ No newline at end of file
+class GpuFsaSubTensorHandle : public IClTensorHandle
+{
+public:
+    GpuFsaSubTensorHandle(IClTensorHandle* parent,
+                          const arm_compute::TensorShape& shape,
+                          const arm_compute::Coordinates& coords)
+        : m_Tensor(&parent->GetTensor(), shape, coords)
+    {
+        parentHandle = parent;
+    }
+
+    arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; }
+    arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; }
+
+    virtual void Allocate() override {}
+    virtual void Manage() override {}
+
+    virtual const void* Map(bool blocking = true) const override
+    {
+        const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->map(blocking);
+        return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+    }
+    virtual void Unmap() const override { const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->unmap(); }
+
+    virtual ITensorHandle* GetParent() const override { return parentHandle; }
+
+    virtual arm_compute::DataType GetDataType() const override
+    {
+        return m_Tensor.info()->data_type();
+    }
+
+    virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+    TensorShape GetStrides() const override
+    {
+        return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+    }
+
+    TensorShape GetShape() const override
+    {
+        return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+    }
+
+private:
+    // Only used for testing
+    void CopyOutTo(void* memory) const override
+    {
+        const_cast<GpuFsaSubTensorHandle*>(this)->Map(true);
+        switch(this->GetDataType())
+        {
+            case arm_compute::DataType::F32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<float*>(memory));
+                break;
+            case arm_compute::DataType::U8:
+            case arm_compute::DataType::QASYMM8:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<uint8_t*>(memory));
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<armnn::Half*>(memory));
+                break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int8_t*>(memory));
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int16_t*>(memory));
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int32_t*>(memory));
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        const_cast<GpuFsaSubTensorHandle*>(this)->Unmap();
+    }
+
+    // Only used for testing
+    void CopyInFrom(const void* memory) override
+    {
+        this->Map(true);
+        switch(this->GetDataType())
+        {
+            case arm_compute::DataType::F32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::U8:
+            case arm_compute::DataType::QASYMM8:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        this->Unmap();
+    }
+
+    mutable arm_compute::CLSubTensor m_Tensor;
+    ITensorHandle* parentHandle = nullptr;
+};
+
+} // namespace armnn
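The handle above maps the underlying CLTensor for every host-side access and unmaps it afterwards. A minimal usage sketch of that contract follows; ReadOutput is a hypothetical helper, not part of this patch, and only the existing ITensorHandle::Map/Unmap interface is assumed:

    #include <armnn/backends/ITensorHandle.hpp>
    #include <cstring>

    // Copy numBytes out of a CL-backed handle into host memory.
    void ReadOutput(const armnn::ITensorHandle& handle, void* dst, size_t numBytes)
    {
        const void* src = handle.Map(true); // blocks until the CL buffer is host-visible
        std::memcpy(dst, src, numBytes);
        handle.Unmap();                     // required before the GPU touches the tensor again
    }

Each Map must be paired with an Unmap, which is why the CopyOutTo/CopyInFrom test helpers above bracket their switch statements with exactly this pattern.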
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
index cd9d8cd64d..c1a34d24e5 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
@@ -1,32 +1,50 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "GpuFsaTensorHandle.hpp"
 #include "GpuFsaTensorHandleFactory.hpp"
 
-#include "armnn/Logging.hpp"
-#include
-
 namespace armnn
 {
 
 using FactoryId = ITensorHandleFactory::FactoryId;
 
-const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
-{
-    static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
-    return s_Id;
-}
-
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateSubTensorHandle(ITensorHandle& parent,
-                                                                                const TensorShape& subTensorShape,
-                                                                                const unsigned int* subTensorOrigin)
-    const
+                                                                                const TensorShape& subTensorShape,
+                                                                                const unsigned int* subTensorOrigin) const
 {
-    IgnoreUnused(parent, subTensorShape, subTensorOrigin);
-    return nullptr;
+    arm_compute::Coordinates coords;
+    arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
+
+    coords.set_num_dimensions(subTensorShape.GetNumDimensions());
+    for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); ++i)
+    {
+        // Arm compute indexes tensor coords in reverse order.
+        unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
+        coords.set(i, armnn::numeric_cast<int>(subTensorOrigin[revertedIndex]));
+    }
+
+    const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.GetShape());
+
+    // In order for ACL to support subtensors the concat axis cannot be on x or y and the values of x and y
+    // must match the parent shapes
+    if (coords.x() != 0 || coords.y() != 0)
+    {
+        return nullptr;
+    }
+    if ((parentShape.x() != shape.x()) || (parentShape.y() != shape.y()))
+    {
+        return nullptr;
+    }
+
+    if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape))
+    {
+        return nullptr;
+    }
+
+    return std::make_unique<GpuFsaSubTensorHandle>(PolymorphicDowncast<IClTensorHandle*>(&parent), shape, coords);
 }
 
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
@@ -43,25 +61,32 @@ std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(con
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                              const bool IsMemoryManaged) const
 {
-    std::unique_ptr<GpuFsaTensorHandle> handle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
     if (!IsMemoryManaged)
     {
         ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed.";
     }
-    return handle;
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+    return tensorHandle;
 }
 
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                              DataLayout dataLayout,
                                                                              const bool IsMemoryManaged) const
 {
-    IgnoreUnused(dataLayout);
-    std::unique_ptr<GpuFsaTensorHandle> handle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
     if (!IsMemoryManaged)
     {
         ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed.";
     }
-    return handle;
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+    return tensorHandle;
+}
+
+const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
+{
+    static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
+    return s_Id;
 }
 
 const FactoryId& GpuFsaTensorHandleFactory::GetId() const
@@ -71,7 +96,7 @@ const FactoryId& GpuFsaTensorHandleFactory::GetId() const
 
 bool GpuFsaTensorHandleFactory::SupportsSubTensors() const
 {
-    return false;
+    return true;
 }
 
 MemorySourceFlags GpuFsaTensorHandleFactory::GetExportFlags() const
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
index 9f88de598b..93a44259f6 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
@@ -1,14 +1,13 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
-
 #pragma once
 
-#include "GpuFsaMemoryManager.hpp"
-
 #include
+#include
+
 namespace armnn
 {
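CreateSubTensorHandle above reverses the sub-tensor origin because ACL's Coordinates index the innermost dimension first, while ArmNN's TensorShape is ordered outermost-first. A standalone sketch of just that index reversal; the dimension count and origin are made-up values, not from the patch:

    #include <cstdio>

    int main()
    {
        const unsigned int numDims = 4;
        // Origin of the sub-tensor in ArmNN dimension order, e.g. N,C,H,W.
        const unsigned int subTensorOrigin[numDims] = {0, 2, 0, 0};
        int aclCoords[numDims];
        for (unsigned int i = 0; i < numDims; ++i)
        {
            // Same arithmetic as the factory loop: ACL index i maps to
            // ArmNN dimension (numDims - i - 1).
            aclCoords[i] = static_cast<int>(subTensorOrigin[numDims - i - 1]);
        }
        std::printf("%d %d %d %d\n", aclCoords[0], aclCoords[1], aclCoords[2], aclCoords[3]); // prints: 0 0 2 0
        return 0;
    }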
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
index 687c8c0ac8..6d13879f51 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
@@ -1,10 +1,10 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+
 #include
-#include
-#include
+
 #include "GpuFsaWorkloadFactory.hpp"
 #include "GpuFsaBackendId.hpp"
 #include "GpuFsaTensorHandle.hpp"
@@ -17,11 +17,9 @@ namespace
 {
 static const BackendId s_Id{GpuFsaBackendId()};
 }
 
 template <typename QueueDescriptorType>
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& descriptor,
-                                                               const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& /*descriptor*/,
+                                                               const WorkloadInfo& /*info*/) const
 {
-    IgnoreUnused(descriptor);
-    IgnoreUnused(info);
     return nullptr;
 }
 
@@ -64,51 +62,29 @@ bool GpuFsaWorkloadFactory::IsLayerSupported(const Layer& layer,
     return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported);
 }
 
-bool GpuFsaWorkloadFactory::IsLayerSupported(const IConnectableLayer& layer,
-                                             Optional<DataType> dataType,
-                                             std::string& outReasonIfUnsupported,
-                                             const ModelOptions& modelOptions)
-{
-    return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported, modelOptions);
-}
-
 std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
-                                                                         const bool isMemoryManaged) const
+                                                                         const bool /*isMemoryManaged*/) const
 {
-    if (isMemoryManaged)
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
-    }
-    else
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, static_cast<MemorySourceFlags>(MemorySource::Malloc));
-    }
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+
+    return tensorHandle;
 }
 
 std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                          DataLayout dataLayout,
-                                                                         const bool isMemoryManaged) const
+                                                                         const bool /*isMemoryManaged*/) const
 {
-    IgnoreUnused(dataLayout);
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
 
-    if (isMemoryManaged)
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
-    }
-    else
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, static_cast<MemorySourceFlags>(MemorySource::Malloc));
-    }
+    return tensorHandle;
 }
 
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType type,
-                                                                 const QueueDescriptor& descriptor,
-                                                                 const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/,
+                                                                 const QueueDescriptor& /*descriptor*/,
+                                                                 const WorkloadInfo& /*info*/) const
 {
-    IgnoreUnused(type);
-    IgnoreUnused(descriptor);
-    IgnoreUnused(info);
-
     return nullptr;
 }
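Handles created by this factory no longer own host memory: they are attached to the memory manager's inter-layer memory group, so Manage() records the tensor with ACL's lifetime planner and Allocate() commits the backing CL buffer. A rough sketch of that sequence as a caller might drive it; this is hypothetical driver code, not from the patch:

    #include <armnn/backends/ITensorHandle.hpp>

    void PrepareManagedHandle(armnn::ITensorHandle& handle)
    {
        handle.Manage();   // register the tensor with its memory group for planning
        handle.Allocate(); // commit the allocation once planning is complete
    }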
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
index 0d80f0363c..9b97070766 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
@@ -1,14 +1,12 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #pragma once
 
-#include "GpuFsaMemoryManager.hpp"
+#include
 
 #include
-#include
-#include
 
 namespace armnn
 {
@@ -28,19 +26,13 @@ public:
                                  Optional<DataType> dataType,
                                  std::string& outReasonIfUnsupported);
 
-    static bool IsLayerSupported(const IConnectableLayer& layer,
-                                 Optional<DataType> dataType,
-                                 std::string& outReasonIfUnsupported,
-                                 const ModelOptions& modelOptions);
-
     bool SupportsSubTensors() const override { return false; }
 
     ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateSubTensorHandle instead")
-    std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& parent,
-                                                         TensorShape const& subTensorShape,
-                                                         unsigned int const* subTensorOrigin) const override
+    std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& /*parent*/,
+                                                         TensorShape const& /*subTensorShape*/,
+                                                         unsigned int const* /*subTensorOrigin*/) const override
     {
-        IgnoreUnused(parent, subTensorShape, subTensorOrigin);
         return nullptr;
     }
diff --git a/src/backends/gpuFsa/backend.cmake b/src/backends/gpuFsa/backend.cmake
index 589af19c22..2f4f5fbc7b 100644
--- a/src/backends/gpuFsa/backend.cmake
+++ b/src/backends/gpuFsa/backend.cmake
@@ -1,12 +1,12 @@
 #
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 #
 
 add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/gpuFsa)
 list(APPEND armnnLibraries armnnGpuFsaBackend)
 
-if(ARMNNGPUFSA)
+if(ARMCOMPUTEGPUFSA)
     list(APPEND armnnLibraries armnnGpuFsaBackendWorkloads)
     list(APPEND armnnUnitTestLibraries armnnGpuFsaBackendUnitTests)
 else()
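The option rename also renames the preprocessor symbol, so shared code can guard GpuFsa-only paths the same way ARMCOMPUTECL_ENABLED guards OpenCL paths. A hypothetical guard, illustrative only:

    #if defined(ARMCOMPUTEGPUFSA_ENABLED)
    // Compile GpuFsa-specific code, e.g. registering the GpuFsa tensor handle factory.
    #endif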
diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk
index 840e10338c..78ba7ba167 100644
--- a/src/backends/gpuFsa/backend.mk
+++ b/src/backends/gpuFsa/backend.mk
@@ -1,5 +1,5 @@
 #
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 #
 
@@ -8,23 +8,23 @@
 # file in the root of ArmNN
 
 # The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_GPU_FSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_GPU_FSA_ENABLED),1)
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
 
-# ARMNN_GPU_FSA_ENABLED == 1
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
 # Include the source files for the GPU Dynamic Fusion backend
 
 BACKEND_SOURCES := \
         GpuFsaBackend.cpp \
+        GpuFsaBackendContext.cpp \
+        GpuFsaContextControl.cpp \
         GpuFsaLayerSupport.cpp \
-        GpuFsaMemoryManager.cpp \
         GpuFsaRegistryInitializer.cpp \
-        GpuFsaTensorHandle.cpp \
        GpuFsaTensorHandleFactory.cpp \
         GpuFsaWorkloadFactory.cpp
 else
 
-# ARMNN_GPU_FSA_ENABLED == 0
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
 # No source file will be compiled for the GPU Dynamic Fusion backend
 
 BACKEND_SOURCES :=
@@ -36,10 +36,10 @@ endif
 # up by the Android.mk file in the root of ArmNN
 
 # The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_GPU_FSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_GPU_FSA_ENABLED),1)
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
 
-# ARMNN_GPU_FSA_ENABLED == 1
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
 # Include the source files for the GPU Dynamic Fusion backend tests
 
 BACKEND_TEST_SOURCES := \
@@ -49,7 +49,7 @@ BACKEND_TEST_SOURCES := \
         test/GpuFsaOptimizedNetworkTests.cpp
 else
 
-# ARMNN_GPU_FSA_ENABLED == 0
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
 # No source file will be compiled for the GPU Dynamic Fusion backend tests
 
 BACKEND_TEST_SOURCES :=
diff --git a/src/backends/gpuFsa/test/CMakeLists.txt b/src/backends/gpuFsa/test/CMakeLists.txt
index c600589768..66091e90df 100644
--- a/src/backends/gpuFsa/test/CMakeLists.txt
+++ b/src/backends/gpuFsa/test/CMakeLists.txt
@@ -1,9 +1,10 @@
 #
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 #
 
 list(APPEND armnnGpuFsaBackendUnitTests_sources
+    GpuFsaDefaultAllocatorTests.cpp
     GpuFsaEndToEndTests.cpp
     GpuFsaLayerTests.cpp
     GpuFsaLayerSupportTests.cpp
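The tests added below register an allocator for the "GpuFsa" backend id through IRuntime::CreationOptions::m_CustomAllocatorMap. For comparison, a minimal host-memory allocator sketch; HostAllocator is hypothetical, and it assumes the allocate/free/GetMemorySourceType signatures from armnn/backends/ICustomAllocator.hpp:

    #include <armnn/backends/ICustomAllocator.hpp>
    #include <cstddef>
    #include <cstdlib>

    class HostAllocator : public armnn::ICustomAllocator
    {
    public:
        void* allocate(size_t size, size_t alignment) override
        {
            // The tests below pass alignment == 0, meaning "use a sensible default".
            size_t align = (alignment == 0) ? alignof(std::max_align_t) : alignment;
            // std::aligned_alloc requires size to be a multiple of the alignment.
            size_t paddedSize = ((size + align - 1) / align) * align;
            return std::aligned_alloc(align, paddedSize);
        }

        void free(void* ptr) override { std::free(ptr); }

        armnn::MemorySource GetMemorySourceType() override { return armnn::MemorySource::Malloc; }
    };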
diff --git a/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
new file mode 100644
index 0000000000..1f603e2718
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
@@ -0,0 +1,193 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+// Requires the OpenCl backend to be included (GpuFsa)
+#include
+#include
+#include
+#include
+#include
+
+using namespace armnn;
+
+namespace
+{
+
+TEST_SUITE("DefaultAllocatorTests")
+{
+
+TEST_CASE("DefaultAllocatorTest")
+{
+    float number = 3;
+
+    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    auto customAllocator = std::make_shared<DefaultAllocator>();
+    options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Creates structures for input & output
+    unsigned int numElements = inputTensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, number);
+    CHECK(inputPtr[0] == 3);
+
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMulti")
+{
+    float number = 3;
+
+    TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32);
+
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    auto customAllocator = std::make_shared<DefaultAllocator>();
+    options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Creates structures for input & output
+    unsigned int numElements = inputTensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+    void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, number);
+    CHECK(inputPtr[0] == 3);
+    CHECK(inputPtr[1] == 3);
+
+    auto* inputPtr2 = reinterpret_cast<float*>(alignedInputPtr2);
+    std::fill_n(inputPtr2, numElements, number);
+    CHECK(inputPtr2[0] == 3);
+    CHECK(inputPtr2[1] == 3);
+
+    // No overlap
+    CHECK(inputPtr[0] == 3);
+    CHECK(inputPtr[1] == 3);
+
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMock")
+{
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Initialize Mock Backend
+    MockBackendInitialiser initialiser;
+    auto factoryFun = BackendRegistryInstance().GetFactory(MockBackend().GetIdStatic());
+    ARMNN_ASSERT(factoryFun != nullptr);
+    auto backend = factoryFun();
+    auto defaultAllocator = backend->GetDefaultAllocator();
+
+    // GetMemorySourceType
+    CHECK(defaultAllocator->GetMemorySourceType() == MemorySource::Malloc);
+
+    size_t totalBytes = 1 * sizeof(float);
+    // Allocate
+    void* ptr = defaultAllocator->allocate(totalBytes, 0);
+
+    // GetMemoryRegionAtOffset
+    CHECK(defaultAllocator->GetMemoryRegionAtOffset(ptr, 0, 0));
+
+    // Free
+    defaultAllocator->free(ptr);
+
+    // Clean up
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.Deregister(MockBackend().GetIdStatic());
+    backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+}
+
+
+TEST_SUITE("GpuFsaDefaultAllocatorTests")
+{
+
+TEST_CASE("GpuFsaDefaultAllocatorTest") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +TEST_CASE("GpuFsaDefaultAllocatorTestMulti") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto* inputPtr2 = reinterpret_cast(alignedInputPtr2); + std::fill_n(inputPtr2, numElements, number); + CHECK(inputPtr2[0] == 3); + CHECK(inputPtr2[1] == 3); + + // No overlap + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +} + +} // namespace armnn \ No newline at end of file -- cgit v1.2.1