author     Cathal Corbett <cathal.corbett@arm.com>   2023-01-09 12:47:48 +0000
committer  Cathal Corbett <cathal.corbett@arm.com>   2023-01-12 11:58:50 +0000
commit     2b32a69f3aac5496d0a966d9740cb4854504f3d9 (patch)
tree       0ffc0710d5dd0feb9aa35be3defc8111d1c035b8
parent     d69c1c595375b904a7f19f562ac1d54098184b4e (diff)
download   armnn-2b32a69f3aac5496d0a966d9740cb4854504f3d9.tar.gz
IVGCVSW-7380 Update the GpuFsa Skeleton to build and load ACL
* Reuse the cl backend so that ClRuntime, ClContexts, etc. can be created for the new GpuFsa backend.
* Can access code defined in the experimental dynamic_fusion interface.
* No BackendModelContext, as model/backend options are not required for now.
* Serializer and deserializer are omitted, as context caching is not required.
* No ImportTensorHandle or ImportTensorHandleFactory for now.
* Moved tuning and IClTensorHandle code to aclCommon, as it is accessed by both cl and gpuFsa.
* Small code refactor of the cl backend.
* Added DefaultAllocatorTests to the GpuFsa backend.

Signed-off-by: Cathal Corbett <cathal.corbett@arm.com>
Change-Id: I6ae591360e9d2a783aafd06e2d7bf8e0b3e623ee
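For context, the following sketch (illustrative application code, not part of this patch) shows how the new backend is requested through the public Arm NN API; note that the Network.cpp change in this commit makes "GpuFsa" and "GpuAcc" mutually exclusive in the backend preference list.

// Illustrative sketch only: assumes the public Arm NN API (IRuntime, INetwork, Optimize).
#include <armnn/ArmNN.hpp>
#include <vector>

int main()
{
    using namespace armnn;

    // Create a runtime and a trivial network: one input connected straight to one output.
    IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());
    INetworkPtr network = INetwork::Create();
    IConnectableLayer* input  = network->AddInputLayer(0);
    IConnectableLayer* output = network->AddOutputLayer(0);
    input->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 4 }, DataType::Float32));

    // Request the experimental GpuFsa backend with CpuRef as a fallback.
    // Listing "GpuFsa" together with "GpuAcc" now throws InvalidArgumentException.
    std::vector<BackendId> backends = { "GpuFsa", "CpuRef" };
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());

    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));
    return 0;
}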
-rw-r--r--  CMakeLists.txt                                             |    2
-rw-r--r--  cmake/GlobalConfig.cmake                                   |   19
-rw-r--r--  src/armnn/Network.cpp                                      |   14
-rw-r--r--  src/backends/aclCommon/BaseMemoryManager.cpp               |   14
-rw-r--r--  src/backends/aclCommon/BaseMemoryManager.hpp               |   34
-rw-r--r--  src/backends/aclCommon/common.cmake                        |    4
-rw-r--r--  src/backends/gpuFsa/CMakeLists.txt                         |   20
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackend.cpp                      |  172
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackend.hpp                      |  271
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendContext.cpp               |  230
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendContext.hpp               |   47
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp      |   51
-rw-r--r--  src/backends/gpuFsa/GpuFsaContextControl.cpp               |  163
-rw-r--r--  src/backends/gpuFsa/GpuFsaContextControl.hpp               |   42
-rw-r--r--  src/backends/gpuFsa/GpuFsaMemoryManager.cpp                |  101
-rw-r--r--  src/backends/gpuFsa/GpuFsaMemoryManager.hpp                |   59
-rw-r--r--  src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp          |    4
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandle.cpp                 |  176
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandle.hpp                 |  350
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp          |   67
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp          |    7
-rw-r--r--  src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp              |   58
-rw-r--r--  src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp              |   18
-rw-r--r--  src/backends/gpuFsa/backend.cmake                          |    4
-rw-r--r--  src/backends/gpuFsa/backend.mk                             |   22
-rw-r--r--  src/backends/gpuFsa/test/CMakeLists.txt                    |    3
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp   |  193
27 files changed, 1582 insertions(+), 563 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 476e080442..19626f2862 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -502,7 +502,7 @@ endif()
install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
target_link_libraries(armnn PUBLIC ${ARMCOMPUTE_LIBRARIES})
endif()
diff --git a/cmake/GlobalConfig.cmake b/cmake/GlobalConfig.cmake
index bc9117f702..8a1211246c 100644
--- a/cmake/GlobalConfig.cmake
+++ b/cmake/GlobalConfig.cmake
@@ -10,7 +10,7 @@ option(BUILD_TESTS "Build test applications" OFF)
option(BUILD_FOR_COVERAGE "Use no optimization and output .gcno and .gcda files" OFF)
option(ARMCOMPUTENEON "Build with ARM Compute NEON support" OFF)
option(ARMCOMPUTECL "Build with ARM Compute OpenCL support" OFF)
-option(ARMNNGPUFSA "Build with GPU Dynamic Fusion Backend" OFF)
+option(ARMCOMPUTEGPUFSA "Build with GPU Dynamic Fusion Backend" OFF)
option(ARMNNREF "Build with ArmNN reference support" ON)
option(ARMNNTOSAREF "Build with TOSA reference support" OFF)
option(PROFILING_BACKEND_STREAMLINE "Forward the armNN profiling events to DS-5/Streamline as annotations" OFF)
@@ -261,7 +261,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/profiling)
# ARM Compute
# Note that ARM Compute has a different folder layout depending on the branch but also on
# whether it comes from a prepackaged archive (this is why we add several hints below)
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
find_path(ARMCOMPUTE_INCLUDE arm_compute/core/CL/OpenCL.h
PATHS ${ARMCOMPUTE_ROOT}/include
PATHS ${ARMCOMPUTE_ROOT}/applications/arm_compute
@@ -330,7 +330,7 @@ if(ARMCOMPUTENEON)
endif()
# ARM Compute OpenCL backend
-if(ARMCOMPUTECL)
+if(ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
# verify we have a valid flatbuffers include path
find_path(FLATBUFFERS_INCLUDE_PATH flatbuffers/flatbuffers.h
HINTS ${FLATBUFFERS_ROOT}/include /usr/local/include /usr/include)
@@ -354,15 +354,22 @@ if(ARMCOMPUTECL)
include_directories(SYSTEM ${OPENCL_INCLUDE})
- # Add preprocessor definition for ARM Compute OpenCL
- add_definitions(-DARMCOMPUTECL_ENABLED)
+ if(ARMCOMPUTECL)
+ # Add preprocessor definition for ARM Compute OpenCL
+ add_definitions(-DARMCOMPUTECL_ENABLED)
+ endif()
+
+ if(ARMCOMPUTEGPUFSA)
+ # Add preprocessor definition for ARM Compute OpenCL
+ add_definitions(-DARMCOMPUTEGPUFSA_ENABLED)
+ endif()
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DARM_COMPUTE_DEBUG_ENABLED")
endif()
# Used by both Arm Compute backends, but should be added
# to the search path after the system directories if necessary
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
find_path(HALF_INCLUDE half/half.hpp)
find_path(HALF_INCLUDE half/half.hpp
PATHS ${ARMCOMPUTE_ROOT}/include
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 42388bfbd7..cda87e89c2 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2017,2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017,2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
@@ -1582,6 +1582,18 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph,
ProfilerManager::GetInstance().RegisterProfiler(profiler.get());
profiler->EnableProfiling(options.m_ProfilingEnabled);
+ // Some backends don't play well together. Check here before continuing.
+ {
+ std::set<BackendId> backendSet(backendPreferences.begin(), backendPreferences.end());
+ // GpuFsa cannot co-exist with GpuAcc.
+ if (backendSet.find("GpuFsa") != backendSet.end() &&
+ backendSet.find("GpuAcc") != backendSet.end())
+ {
+ throw InvalidArgumentException("The backends \"GpuAcc\" and \"GpuFsa\" cannot be specified "
+ "for the same optimized network.");
+ }
+ }
+
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Optimizer");
if (backendPreferences.empty())
{
diff --git a/src/backends/aclCommon/BaseMemoryManager.cpp b/src/backends/aclCommon/BaseMemoryManager.cpp
index c60a4a04ae..e70d7f851d 100644
--- a/src/backends/aclCommon/BaseMemoryManager.cpp
+++ b/src/backends/aclCommon/BaseMemoryManager.cpp
@@ -1,10 +1,10 @@
//
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "BaseMemoryManager.hpp"
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/PoolManager.h"
#include "arm_compute/runtime/OffsetLifetimeManager.h"
@@ -14,7 +14,7 @@
namespace armnn
{
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
BaseMemoryManager::BaseMemoryManager(std::shared_ptr<arm_compute::IAllocator> alloc,
MemoryAffinity memoryAffinity)
{
@@ -104,4 +104,12 @@ ClMemoryManager::CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryMana
}
#endif
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+std::shared_ptr<arm_compute::IMemoryGroup>
+GpuFsaMemoryManager::CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+{
+ return std::make_shared<arm_compute::MemoryGroup>(memoryManager);
}
+#endif
+
+} \ No newline at end of file
diff --git a/src/backends/aclCommon/BaseMemoryManager.hpp b/src/backends/aclCommon/BaseMemoryManager.hpp
index af099f900a..c18c4830a0 100644
--- a/src/backends/aclCommon/BaseMemoryManager.hpp
+++ b/src/backends/aclCommon/BaseMemoryManager.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
@@ -7,17 +7,13 @@
#include <armnn/backends/IMemoryManager.hpp>
#include <armnn/backends/WorkloadFactory.hpp>
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
-#include <arm_compute/runtime/MemoryGroup.h>
-#endif
-
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
#include <arm_compute/runtime/IAllocator.h>
#include <arm_compute/runtime/IMemoryGroup.h>
#include <arm_compute/runtime/MemoryManagerOnDemand.h>
#endif
-#if defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
#include <arm_compute/runtime/CL/CLTensorAllocator.h>
#endif
@@ -39,7 +35,7 @@ public:
void Acquire() override;
void Release() override;
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
BaseMemoryManager(std::shared_ptr<arm_compute::IAllocator> alloc, MemoryAffinity memoryAffinity);
std::shared_ptr<arm_compute::MemoryManagerOnDemand>& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; }
@@ -98,4 +94,24 @@ protected:
};
#endif
-} //namespace armnn
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+class GpuFsaMemoryManager : public BaseMemoryManager
+{
+public:
+ GpuFsaMemoryManager() {}
+ virtual ~GpuFsaMemoryManager() {}
+
+ GpuFsaMemoryManager(std::shared_ptr<arm_compute::IAllocator> alloc)
+ : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer)
+ {
+ arm_compute::CLTensorAllocator::set_global_allocator(alloc.get());
+ m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr);
+ }
+
+protected:
+ std::shared_ptr<arm_compute::IMemoryGroup>
+ CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) override;
+};
+#endif
+
+} // namespace armnn
diff --git a/src/backends/aclCommon/common.cmake b/src/backends/aclCommon/common.cmake
index 89be236a7f..1ea14951a6 100644
--- a/src/backends/aclCommon/common.cmake
+++ b/src/backends/aclCommon/common.cmake
@@ -1,9 +1,9 @@
#
-# Copyright © 2017 Arm Ltd. All rights reserved.
+# Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/aclCommon)
list(APPEND armnnLibraries armnnAclCommon)
list(APPEND armnnUnitTestLibraries armnnAclCommonUnitTests)
diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt
index f5ddb34854..635b25b2d5 100644
--- a/src/backends/gpuFsa/CMakeLists.txt
+++ b/src/backends/gpuFsa/CMakeLists.txt
@@ -1,24 +1,26 @@
#
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
-if(ARMNNGPUFSA)
+if(ARMCOMPUTEGPUFSA)
list(APPEND armnnGpuFsaBackend_sources
GpuFsaBackend.cpp
GpuFsaBackend.hpp
+ GpuFsaBackendContext.cpp
+ GpuFsaBackendContext.hpp
+ GpuFsaBackendDefaultAllocator.hpp
GpuFsaBackendId.hpp
- GpuFsaTensorHandle.hpp
- GpuFsaTensorHandle.cpp
+ GpuFsaContextControl.cpp
+ GpuFsaContextControl.hpp
GpuFsaLayerSupport.cpp
GpuFsaLayerSupport.hpp
- GpuFsaMemoryManager.hpp
- GpuFsaMemoryManager.cpp
GpuFsaRegistryInitializer.cpp
- GpuFsaWorkloadFactory.cpp
- GpuFsaWorkloadFactory.hpp
+ GpuFsaTensorHandle.hpp
GpuFsaTensorHandleFactory.cpp
GpuFsaTensorHandleFactory.hpp
+ GpuFsaWorkloadFactory.cpp
+ GpuFsaWorkloadFactory.hpp
)
add_subdirectory(workloads)
@@ -30,6 +32,8 @@ if(ARMNNGPUFSA)
else()
list(APPEND armnnGpuFsaBackend_sources
GpuFsaBackendId.hpp
+ GpuFsaLayerSupport.cpp
+ GpuFsaLayerSupport.hpp
)
endif()
diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp
index 9c2f4a0df6..ae7ff0c243 100644
--- a/src/backends/gpuFsa/GpuFsaBackend.cpp
+++ b/src/backends/gpuFsa/GpuFsaBackend.cpp
@@ -1,23 +1,24 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "GpuFsaBackend.hpp"
+#include "GpuFsaBackendContext.hpp"
+#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
-#include "GpuFsaWorkloadFactory.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
+#include "GpuFsaWorkloadFactory.hpp"
-#include <armnn/BackendRegistry.hpp>
#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
-#include <armnn/utility/PolymorphicDowncast.hpp>
-#include <backendsCommon/DefaultAllocator.hpp>
-#include <backendsCommon/SubgraphUtils.hpp>
-
#include <Optimizer.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+
namespace armnn
{
@@ -27,6 +28,15 @@ const BackendId& GpuFsaBackend::GetIdStatic()
return s_Id;
}
+IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
+{
+ if (m_UsingCustomAllocator)
+ {
+ return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+}
+
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
@@ -34,74 +44,142 @@ IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
}
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
- class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const
+ TensorHandleFactoryRegistry& registry) const
{
- auto memoryManager = std::make_shared<GpuFsaMemoryManager>();
-
- tensorHandleFactoryRegistry.RegisterMemoryManager(memoryManager);
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
- auto factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
- // Register copy and import factory pair
- tensorHandleFactoryRegistry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId());
- // Register the factory
- tensorHandleFactoryRegistry.RegisterFactory(std::move(factory));
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}
-IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions&) const
+IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
+ TensorHandleFactoryRegistry& registry,
+ const ModelOptions& modelOptions,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags) const
{
- return IBackendContextPtr{};
+ IgnoreUnused(modelOptions);
+
+ // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
+ if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+ if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
+
+ return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}
-IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
- const IRuntime::CreationOptions&, IBackendProfilingPtr&)
+std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
- return IBackendProfilingContextPtr{};
+ return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}
-IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
+void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
- return std::make_unique<GpuFsaMemoryManager>();
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
+
}
-IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
+void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags)
{
- static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
- return layerSupport;
+ // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
+ if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+ if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
}
-OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
- const ModelOptions& modelOptions) const
+IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
- OptimizationViews optimizationViews(modelOptions);
- optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
-
- return optimizationViews;
+ return IBackendContextPtr{new GpuFsaBackendContext{options}};
}
-std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
+IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
+ const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
- return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
+ return IBackendProfilingContextPtr{};
}
-void GpuFsaBackend::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry)
+IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
- auto memoryManager = std::make_shared<GpuFsaMemoryManager>();
-
- registry.RegisterMemoryManager(memoryManager);
-
- auto factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
-
- // Register copy and import factory pair
- registry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId());
- // Register the factory
- registry.RegisterFactory(std::move(factory));
+ static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
+ return layerSupport;
}
std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
- return std::make_unique<DefaultAllocator>();
+ return std::make_unique<GpuFsaBackendDefaultAllocator>();
+}
+
+OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
+ const ModelOptions& modelOptions) const
+{
+ OptimizationViews optimizationViews(modelOptions);
+ optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
+ return optimizationViews;
}
-} // namespace armnn \ No newline at end of file
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp
index 803c6a4c66..6d886a12b1 100644
--- a/src/backends/gpuFsa/GpuFsaBackend.hpp
+++ b/src/backends/gpuFsa/GpuFsaBackend.hpp
@@ -1,56 +1,287 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
#include <armnn/backends/IBackendInternal.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+#include <arm_compute/runtime/CL/CLMemoryRegion.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <CL/cl_ext.h>
+
+// System includes for mapping and unmapping memory
+#include <sys/mman.h>
namespace armnn
{
+// add new capabilities here..
+const BackendCapabilities gpuFsaCapabilities("GpuFsa",
+ {
+ {"NonConstWeights", false},
+ {"AsyncExecution", false},
+ {"ProtectedContentAllocation", true},
+ {"ConstantTensorsAsInputs", true},
+ {"PreImportIOTensors", false},
+ {"ExternallyManagedMemory", true},
+ {"MultiAxisPacking", false},
+ {"SingleAxisPacking", true}
+ });
+
class GpuFsaBackend : public IBackendInternal
{
public:
- GpuFsaBackend() = default;
+ GpuFsaBackend() : m_CustomAllocator(nullptr) {};
+ GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator)
+ {
+ std::string err;
+ UseCustomMemoryAllocator(allocator, err);
+ }
~GpuFsaBackend() = default;
static const BackendId& GetIdStatic();
- const BackendId& GetId() const override
- {
- return GetIdStatic();
- }
+ const BackendId& GetId() const override { return GetIdStatic(); }
IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;
IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
- const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;
+ const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;
- IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
- class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const override;
+ IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;
- IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
+ IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
+ const ModelOptions& modelOptions,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags) const override;
+
+ std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
+
+ void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;
- IBackendInternal::IBackendProfilingContextPtr
- CreateBackendProfilingContext(const IRuntime::CreationOptions& creationOptions,
- IBackendProfilingPtr& backendProfiling) override;
+ void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags) override;
+
+ IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
+ IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
+ const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;
IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;
OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
const ModelOptions& modelOptions) const override;
- std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
+ std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;
- void RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry) override;
+ BackendCapabilities GetCapabilities() const override
+ {
+ return gpuFsaCapabilities;
+ };
- std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;
+ virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
+ armnn::Optional<std::string&> errMsg) override
+ {
+ IgnoreUnused(errMsg);
+ ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";
+
+ // Set flag to signal the backend to use a custom memory allocator
+ m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));
+ m_UsingCustomAllocator = true;
+ return m_UsingCustomAllocator;
+ }
+
+ // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this
+ class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator
+ {
+ public:
+ GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)
+ {}
+ // Inherited methods overridden:
+ void* allocate(size_t size, size_t alignment) override
+ {
+ auto alloc = m_CustomAllocator->allocate(size, alignment);
+ return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
+ }
+ void free(void* ptr) override
+ {
+ auto hostMemPtr = m_AllocatedBufferMappings[ptr];
+ clReleaseMemObject(static_cast<cl_mem>(ptr));
+ m_CustomAllocator->free(hostMemPtr);
+ }
+ std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
+ {
+ auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
+ cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());
+
+ return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),
+ hostMemPtr,
+ m_CustomAllocator->GetMemorySourceType());
+ }
+ private:
+ cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
+ {
+ // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
+ auto cachelineAlignment =
+ arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+ auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);
+
+ if (source == MemorySource::Malloc)
+ {
+ const cl_import_properties_arm importProperties[] =
+ {
+ CL_IMPORT_TYPE_ARM,
+ CL_IMPORT_TYPE_HOST_ARM,
+ 0
+ };
+ cl_int error = CL_SUCCESS;
+ cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+ CL_MEM_READ_WRITE,
+ importProperties,
+ memory,
+ roundedSize,
+ &error);
+ if (error == CL_SUCCESS)
+ {
+ m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+ return buffer;
+ }
+ throw armnn::Exception(
+ "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
+ }
+ else if (source == MemorySource::DmaBuf)
+ {
+ const cl_import_properties_arm importProperties[] =
+ {
+ CL_IMPORT_TYPE_ARM,
+ CL_IMPORT_TYPE_DMA_BUF_ARM,
+ CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM,
+ CL_TRUE,
+ 0
+ };
+ cl_int error = CL_SUCCESS;
+ cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+ CL_MEM_READ_WRITE,
+ importProperties,
+ memory,
+ roundedSize,
+ &error);
+ if (error == CL_SUCCESS)
+ {
+ m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+ return buffer;
+ }
+ throw armnn::Exception(
+ "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "
+ + std::to_string(error));
+ }
+ else if (source == MemorySource::DmaBufProtected)
+ {
+ const cl_import_properties_arm importProperties[] =
+ {
+ CL_IMPORT_TYPE_ARM,
+ CL_IMPORT_TYPE_DMA_BUF_ARM,
+ CL_IMPORT_TYPE_PROTECTED_ARM,
+ CL_TRUE,
+ 0
+ };
+ cl_int error = CL_SUCCESS;
+ cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+ CL_MEM_READ_WRITE,
+ importProperties,
+ memory,
+ roundedSize,
+ &error);
+ if (error == CL_SUCCESS)
+ {
+ m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+ return buffer;
+ }
+ throw armnn::Exception(
+ "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "
+ + std::to_string(error));
+ }
+ throw armnn::Exception(
+ "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
+ }
+ std::shared_ptr<ICustomAllocator> m_CustomAllocator;
+ std::map<void*, void*> m_AllocatedBufferMappings;
+ };
+
+ class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
+ {
+ public:
+ // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access
+ ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)
+ : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+ {
+ _mem = buffer;
+ m_HostMemPtr = hostMemPtr;
+ m_MemorySource = source;
+ }
+
+ // Inherited methods overridden :
+ void* ptr() override
+ {
+ return nullptr;
+ }
+
+ void* map(cl::CommandQueue &q, bool blocking) override
+ {
+ armnn::IgnoreUnused(q, blocking);
+ if (m_HostMemPtr == nullptr)
+ {
+ throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
+ }
+ if (_mapping != nullptr)
+ {
+ throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");
+ }
+ switch (m_MemorySource)
+ {
+ case armnn::MemorySource::Malloc:
+ _mapping = m_HostMemPtr;
+ return _mapping;
+ break;
+ case armnn::MemorySource::DmaBuf:
+ case armnn::MemorySource::DmaBufProtected:
+ // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd
+ _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast<int*>(m_HostMemPtr)), 0);
+ return _mapping;
+ break;
+ default:
+ throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");
+ break;
+ }
+ }
-private:
- // Private members
+ void unmap(cl::CommandQueue &q) override
+ {
+ armnn::IgnoreUnused(q);
+ switch (m_MemorySource)
+ {
+ case armnn::MemorySource::Malloc:
+ _mapping = nullptr;
+ break;
+ case armnn::MemorySource::DmaBuf:
+ case armnn::MemorySource::DmaBufProtected:
+ munmap(_mapping, _size);
+ _mapping = nullptr;
+ break;
+ default:
+ throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");
+ break;
+ }
+ }
+ private:
+ void* m_HostMemPtr = nullptr;
+ armnn::MemorySource m_MemorySource;
+ };
-protected:
- // Protected members
+ std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator;
+ bool m_UsingCustomAllocator = false;
};
} // namespace armnn
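The GpuFsaBackendCustomAllocatorWrapper above follows the ClBackend pattern: an armnn::ICustomAllocator supplied by the application is adapted to arm_compute::IAllocator, and host pointers are imported into CL buffers via clImportMemoryARM. A minimal sketch of wiring such an allocator into the runtime is shown below; it assumes the IRuntime::CreationOptions::m_CustomAllocatorMap mechanism already used for GpuAcc, and HostMallocAllocator is a hypothetical example, not part of this patch.

// Hypothetical allocator sketch; assumes the armnn::ICustomAllocator interface
// (allocate/free/GetMemorySourceType/GetMemoryRegionAtOffset) shown elsewhere in this patch.
#include <armnn/ArmNN.hpp>
#include <armnn/backends/ICustomAllocator.hpp>
#include <cstddef>
#include <cstdlib>
#include <memory>

class HostMallocAllocator : public armnn::ICustomAllocator
{
public:
    void* allocate(size_t size, size_t alignment) override
    {
        size_t align = (alignment == 0) ? alignof(std::max_align_t) : alignment;
        // std::aligned_alloc requires the size to be a multiple of the alignment.
        size_t roundedSize = ((size + align - 1) / align) * align;
        return std::aligned_alloc(align, roundedSize);
    }

    void free(void* ptr) override { std::free(ptr); }

    armnn::MemorySource GetMemorySourceType() override
    {
        // Malloc selects the CL_IMPORT_TYPE_HOST_ARM import path in the wrapper above.
        return armnn::MemorySource::Malloc;
    }

    void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override
    {
        (void) alignment;
        return static_cast<char*>(buffer) + offset;
    }
};

int main()
{
    armnn::IRuntime::CreationOptions options;
    options.m_CustomAllocatorMap["GpuFsa"] = std::make_shared<HostMallocAllocator>();
    armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);
    // ... optimize and load networks as usual; the backend then runs with m_UsingCustomAllocator set ...
    return 0;
}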
diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.cpp b/src/backends/gpuFsa/GpuFsaBackendContext.cpp
new file mode 100644
index 0000000000..72b77e0d19
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendContext.cpp
@@ -0,0 +1,230 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaBackendContext.hpp"
+#include "GpuFsaContextControl.hpp"
+
+#include <armnn/utility/Assert.hpp>
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+namespace armnn
+{
+
+struct GpuFsaBackendContext::GpuFsaContextControlWrapper
+{
+ GpuFsaContextControlWrapper(arm_compute::CLTuner* tuner,
+ arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle,
+ bool profilingEnabled)
+ : m_GpuFsaContextControl(tuner, heuristicsHandle, profilingEnabled)
+ {}
+
+ bool Sync()
+ {
+ if (arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // Waits for all queued CL requests to finish before unloading the network they may be using.
+ try
+ {
+ // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error.
+ arm_compute::CLScheduler::get().sync();
+ }
+ catch (const cl::Error&)
+ {
+ ARMNN_LOG(warning) << "Runtime::UnloadNetwork(): an error occurred while waiting for "
+ "the queued CL requests to finish";
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ void ClearClCache()
+ {
+ if (arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // There are no loaded networks left, so clear the CL cache to free up memory
+ m_GpuFsaContextControl.ClearClCache();
+ }
+ }
+
+ GpuFsaContextControl m_GpuFsaContextControl;
+};
+
+GpuFsaBackendContext::GpuFsaBackendContext(const IRuntime::CreationOptions& options)
+ : IBackendContext(options)
+ , m_TuningFile()
+{
+ bool kernelProfiling = options.m_EnableGpuProfiling;
+
+ arm_compute::CLTuner* tuner = nullptr;
+ arm_compute::CLGEMMHeuristicsHandle* mlgoTuner = nullptr;
+ bool useLegacyTunerAPI = options.m_GpuAccTunedParameters.get() != nullptr;
+ if (useLegacyTunerAPI)
+ {
+ auto clTunerParams = PolymorphicDowncast<ClTunedParameters*>(
+ options.m_GpuAccTunedParameters.get());
+ tuner = &clTunerParams->m_Tuner;
+
+ if (tuner)
+ {
+ auto ConvertTuningLevel = [](IGpuAccTunedParameters::TuningLevel level,
+ armnn::IGpuAccTunedParameters::Mode mode)
+ {
+ if (mode == armnn::IGpuAccTunedParameters::Mode::UseTunedParameters)
+ {
+ return TuningLevel::None;
+ }
+
+ switch(level)
+ {
+ case IGpuAccTunedParameters::TuningLevel::Rapid:
+ return TuningLevel::Rapid;
+ case IGpuAccTunedParameters::TuningLevel::Normal:
+ return TuningLevel::Normal;
+ case IGpuAccTunedParameters::TuningLevel::Exhaustive:
+ return TuningLevel::Exhaustive;
+ default:
+ {
+ ARMNN_ASSERT_MSG(false, "Tuning level not recognised.");
+ return TuningLevel::None;
+ }
+ }
+ };
+
+ TuningLevel tuningLevel = ConvertTuningLevel(clTunerParams->m_TuningLevel, clTunerParams->m_Mode);
+ ConfigureTuner(*tuner, tuningLevel);
+ }
+ }
+ else //New backend options API
+ {
+ const TuningLevel defaultTuningLevel = TuningLevel::None;
+ auto tuningLevel = defaultTuningLevel;
+
+ ParseOptions(options.m_BackendOptions, "GpuFsa", [&](std::string name, const BackendOptions::Var& value)
+ {
+ if (name == "KernelProfilingEnabled")
+ {
+ kernelProfiling |= ParseBooleanBackendOption(value, false);
+ } else if (name == "TuningFile")
+ {
+ m_TuningFile = ParseStringBackendOption(value, "");
+ } else if (name == "TuningLevel")
+ {
+ tuningLevel = ParseTuningLevel(value, defaultTuningLevel);
+ }
+ else if (name == "MLGOTuningFilePath")
+ {
+ m_MLGOTuningFile = ParseStringBackendOption(value, "");
+ }
+ });
+
+ // Create the tuner, in tuning mode initially.
+ m_Tuner = std::make_unique<arm_compute::CLTuner>(true);
+
+ ConfigureTuner(*(m_Tuner.get()), tuningLevel);
+
+ if (!m_TuningFile.empty())
+ {
+ try
+ {
+ ARMNN_LOG(info) << "Loading Gpu tuning data from file: " << m_TuningFile;
+ m_Tuner->load_from_file(m_TuningFile.c_str());
+ }
+ catch (const std::exception& e)
+ {
+ // Warn if not tuning, otherwise tuning will generate new params
+ if (tuningLevel == TuningLevel::None)
+ {
+ ARMNN_LOG(warning) << "Could not load GpuFsa tuner data file.";
+ }
+ }
+ }
+
+ if (!m_MLGOTuningFile.empty())
+ {
+ try
+ {
+ ARMNN_LOG(info) << "Loading Gpu MLGO tuning data from file: " << m_TuningFile;
+ if(m_MLGOTuner.reload_from_file(m_MLGOTuningFile.c_str()))
+ {
+ mlgoTuner = &m_MLGOTuner;
+ }
+ }
+ catch (const std::exception& e)
+ {
+ ARMNN_LOG(warning) << "Could not load GpuFsa MLGO tuner data file.";
+ }
+ }
+
+ tuner = m_Tuner.get();
+ }
+
+ m_GpuFsaContextControlWrapper = std::make_unique<GpuFsaContextControlWrapper>(
+ tuner,
+ mlgoTuner,
+ kernelProfiling
+ );
+}
+
+bool GpuFsaBackendContext::BeforeLoadNetwork(NetworkId)
+{
+ return true;
+}
+
+bool GpuFsaBackendContext::AfterLoadNetwork(NetworkId networkId)
+{
+ {
+ std::lock_guard<std::mutex> lockGuard(m_Mutex);
+ m_NetworkIds.insert(networkId);
+ }
+ return true;
+}
+
+bool GpuFsaBackendContext::BeforeUnloadNetwork(NetworkId)
+{
+ return m_GpuFsaContextControlWrapper->Sync();
+}
+
+bool GpuFsaBackendContext::AfterUnloadNetwork(NetworkId networkId)
+{
+ bool clearCache = false;
+ {
+ std::lock_guard<std::mutex> lockGuard(m_Mutex);
+ m_NetworkIds.erase(networkId);
+ clearCache = m_NetworkIds.empty();
+ }
+
+ if (clearCache)
+ {
+ m_GpuFsaContextControlWrapper->ClearClCache();
+ }
+
+ return true;
+}
+
+bool GpuFsaBackendContext::AfterEnqueueWorkload(NetworkId)
+{
+ return m_GpuFsaContextControlWrapper->Sync();
+}
+
+GpuFsaBackendContext::~GpuFsaBackendContext()
+{
+ if (m_Tuner && !m_TuningFile.empty())
+ {
+ try
+ {
+ m_Tuner->save_to_file(m_TuningFile.c_str());
+ }
+ catch(const std::exception& e)
+ {
+ ARMNN_LOG(warning) << "Could not save GpuFsa tuner data to file " << m_TuningFile;
+ }
+ }
+}
+
+} // namespace armnn \ No newline at end of file
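GpuFsaBackendContext reads its settings from the "GpuFsa" entry in IRuntime::CreationOptions::m_BackendOptions, mirroring the existing GpuAcc backend context. A sketch of supplying those options is shown below; the file names are placeholders and the TuningLevel values are assumed to map 0=None, 1=Rapid, 2=Normal, 3=Exhaustive as in ParseTuningLevel.

// Illustrative sketch only: the option names match the ParseOptions() calls above.
#include <armnn/ArmNN.hpp>

int main()
{
    armnn::IRuntime::CreationOptions options;
    options.m_EnableGpuProfiling = false;   // kernel profiling can also be enabled per backend below

    options.m_BackendOptions.emplace_back(armnn::BackendOptions
    {
        "GpuFsa",
        {
            { "KernelProfilingEnabled", true },
            { "TuningLevel",            2 },                    // assumed: 0=None, 1=Rapid, 2=Normal, 3=Exhaustive
            { "TuningFile",             "gpufsa_tuning.bin" },  // placeholder path
            { "MLGOTuningFilePath",     "gpufsa_mlgo.bin" }     // placeholder path
        }
    });

    armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);
    // ... optimize and load networks; tuning data is saved back to TuningFile when the context is destroyed ...
    return 0;
}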
diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.hpp b/src/backends/gpuFsa/GpuFsaBackendContext.hpp
new file mode 100644
index 0000000000..271688fd99
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendContext.hpp
@@ -0,0 +1,47 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/backends/IBackendContext.hpp>
+#include <unordered_set>
+#include <mutex>
+
+#include <arm_compute/runtime/CL/CLTuner.h>
+#include <arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h>
+
+namespace armnn
+{
+
+class GpuFsaBackendContext : public IBackendContext
+{
+public:
+ GpuFsaBackendContext(const IRuntime::CreationOptions& options);
+
+ bool BeforeLoadNetwork(NetworkId networkId) override;
+ bool AfterLoadNetwork(NetworkId networkId) override;
+
+ bool BeforeUnloadNetwork(NetworkId networkId) override;
+ bool AfterUnloadNetwork(NetworkId networkId) override;
+
+ bool AfterEnqueueWorkload(NetworkId networkId) override;
+
+ ~GpuFsaBackendContext() override;
+
+private:
+ std::mutex m_Mutex;
+ struct GpuFsaContextControlWrapper;
+ std::unique_ptr<GpuFsaContextControlWrapper> m_GpuFsaContextControlWrapper;
+
+ std::unordered_set<NetworkId> m_NetworkIds;
+
+ std::unique_ptr<arm_compute::CLTuner> m_Tuner;
+ std::string m_TuningFile;
+
+protected:
+ arm_compute::CLGEMMHeuristicsHandle m_MLGOTuner;
+ std::string m_MLGOTuningFile;
+};
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp
new file mode 100644
index 0000000000..c57ff63b92
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp
@@ -0,0 +1,51 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <memory>
+
+#include <armnn/MemorySources.hpp>
+#include <armnn/utility/IgnoreUnused.hpp>
+
+namespace armnn
+{
+
+/**
+* Default Memory Allocator class returned from IBackendInternal::GetDefaultAllocator(MemorySource)
+*/
+class GpuFsaBackendDefaultAllocator : public ICustomAllocator
+{
+public:
+ GpuFsaBackendDefaultAllocator() = default;
+
+ void* allocate(size_t size, size_t alignment = 0) override
+ {
+ IgnoreUnused(alignment);
+ cl_mem buf{ clCreateBuffer(arm_compute::CLScheduler::get().context().get(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ size,
+ nullptr,
+ nullptr)};
+ return static_cast<void *>(buf);
+ }
+
+ void free(void* ptr) override
+ {
+ ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+ clReleaseMemObject(static_cast<cl_mem>(ptr));
+ }
+
+ MemorySource GetMemorySourceType() override
+ {
+ return MemorySource::Gralloc;
+ }
+
+ void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override
+ {
+ IgnoreUnused(alignment);
+ return static_cast<char*>(buffer) + offset;
+ }
+};
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaContextControl.cpp b/src/backends/gpuFsa/GpuFsaContextControl.cpp
new file mode 100644
index 0000000000..795de5e14d
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaContextControl.cpp
@@ -0,0 +1,163 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaContextControl.hpp"
+
+#include <armnn/Exceptions.hpp>
+#include <armnn/utility/Assert.hpp>
+#include <LeakChecking.hpp>
+
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <fmt/format.h>
+
+namespace cl
+{
+class Context;
+class CommandQueue;
+class Device;
+}
+
+namespace armnn
+{
+
+GpuFsaContextControl::GpuFsaContextControl(arm_compute::CLTuner *tuner,
+ arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle,
+ bool profilingEnabled)
+ : m_Tuner(tuner)
+ , m_HeuristicsHandle(heuristicsHandle)
+ , m_ProfilingEnabled(profilingEnabled)
+{
+ try
+ {
+ std::vector<cl::Platform> platforms;
+ cl::Platform::get(&platforms);
+
+ // Selects default platform for the first element.
+ cl::Platform::setDefault(platforms[0]);
+
+ std::vector<cl::Device> devices;
+ platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+
+ // Selects default device for the first element.
+ cl::Device::setDefault(devices[0]);
+ }
+ catch (const cl::Error& clError)
+ {
+ throw ClRuntimeUnavailableException(fmt::format(
+ "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}",
+ clError.what(), clError.err()));
+ }
+
+ // Removes the use of global CL context.
+ cl::Context::setDefault(cl::Context{});
+ ARMNN_ASSERT(cl::Context::getDefault()() == NULL);
+
+ // Removes the use of global CL command queue.
+ cl::CommandQueue::setDefault(cl::CommandQueue{});
+ ARMNN_ASSERT(cl::CommandQueue::getDefault()() == NULL);
+
+ // Always load the OpenCL runtime.
+ LoadOpenClRuntime();
+}
+
+GpuFsaContextControl::~GpuFsaContextControl()
+{
+ // Load the OpencCL runtime without the tuned parameters to free the memory for them.
+ try
+ {
+ UnloadOpenClRuntime();
+ }
+ catch (const cl::Error& clError)
+ {
+ // This should not happen, it is ignored if it does.
+
+ // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an
+ // exception of type std::length_error.
+ // Using stderr instead in this context as there is no point in nesting try-catch blocks here.
+ std::cerr << "A CL error occurred unloading the runtime tuner parameters: "
+ << clError.what() << ". CL error code is: " << clError.err() << std::endl;
+ }
+}
+
+void GpuFsaContextControl::LoadOpenClRuntime()
+{
+ DoLoadOpenClRuntime(true);
+}
+
+void GpuFsaContextControl::UnloadOpenClRuntime()
+{
+ DoLoadOpenClRuntime(false);
+}
+
+void GpuFsaContextControl::DoLoadOpenClRuntime(bool updateTunedParameters)
+{
+ cl::Device device = cl::Device::getDefault();
+ cl::Context context;
+ cl::CommandQueue commandQueue;
+
+ if (arm_compute::CLScheduler::get().is_initialised() && arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // Wait for all queued CL requests to finish before reinitialising it.
+ arm_compute::CLScheduler::get().sync();
+ }
+
+ try
+ {
+ arm_compute::CLKernelLibrary::get().clear_programs_cache();
+ // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no
+ // context references); it is initialised again, with a proper context, later.
+ arm_compute::CLScheduler::get().init(context, commandQueue, device);
+ arm_compute::CLKernelLibrary::get().init(".", context, device);
+
+ {
+ //
+ // Here we replace the context with a new one in which
+ // the memory leak checks show it as an extra allocation but
+ // because of the scope of the leak checks, it doesn't count
+ // the disposal of the original object. On the other hand it
+ // does count the creation of this context which it flags
+ // as a memory leak. By adding the following line we prevent
+ // this to happen.
+ //
+ ARMNN_DISABLE_LEAK_CHECKING_IN_SCOPE();
+ context = cl::Context(device);
+ }
+
+ // NOTE: In this specific case profiling has to be enabled on the command queue
+ // in order for the CLTuner to work.
+ bool profilingNeededForClTuner = updateTunedParameters && m_Tuner &&
+ m_Tuner->tune_new_kernels();
+
+ if (m_ProfilingEnabled || profilingNeededForClTuner)
+ {
+ // Create a new queue with profiling enabled.
+ commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ // Use default queue.
+ commandQueue = cl::CommandQueue(context, device);
+ }
+ }
+ catch (const cl::Error& clError)
+ {
+ throw ClRuntimeUnavailableException(fmt::format(
+ "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}",
+ clError.what(), clError.err()));
+ }
+
+ // Note the first argument (path to cl source code) will be ignored as they should be embedded in the armcompute.
+ arm_compute::CLKernelLibrary::get().init(".", context, device);
+ arm_compute::CLScheduler::get().init(context, commandQueue, device, m_Tuner, m_HeuristicsHandle);
+}
+
+void GpuFsaContextControl::ClearClCache()
+{
+ DoLoadOpenClRuntime(true);
+}
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaContextControl.hpp b/src/backends/gpuFsa/GpuFsaContextControl.hpp
new file mode 100644
index 0000000000..f77b1fbdd4
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaContextControl.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <aclCommon/ArmComputeTuningUtils.hpp>
+
+namespace armnn
+{
+
+// ARM Compute OpenCL context control.
+class GpuFsaContextControl
+{
+public:
+
+ GpuFsaContextControl(arm_compute::CLTuner* = nullptr,
+ arm_compute::CLGEMMHeuristicsHandle* = nullptr,
+ bool profilingEnabled = false);
+
+ virtual ~GpuFsaContextControl();
+
+ void LoadOpenClRuntime();
+
+ // Users should call this (after freeing all of the cl::Context objects they use)
+ // to release the cached memory used by the compute library.
+ void UnloadOpenClRuntime();
+
+ // Clear the CL cache, without losing the tuned parameter settings.
+ void ClearClCache();
+
+private:
+
+ void DoLoadOpenClRuntime(bool updateTunedParameters);
+
+ arm_compute::CLTuner* m_Tuner;
+ arm_compute::CLGEMMHeuristicsHandle* m_HeuristicsHandle;
+
+ bool m_ProfilingEnabled;
+};
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp
deleted file mode 100644
index 4eefb87d88..0000000000
--- a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-#include "GpuFsaMemoryManager.hpp"
-
-#include <armnn/utility/Assert.hpp>
-
-#include <algorithm>
-
-namespace armnn
-{
-
-GpuFsaMemoryManager::GpuFsaMemoryManager()
-{}
-
-GpuFsaMemoryManager::~GpuFsaMemoryManager()
-{}
-
-GpuFsaMemoryManager::Pool* GpuFsaMemoryManager::Manage(unsigned int numBytes)
-{
- if (!m_FreePools.empty())
- {
- Pool* res = m_FreePools.back();
- m_FreePools.pop_back();
- res->Reserve(numBytes);
- return res;
- }
- else
- {
- m_Pools.push_front(Pool(numBytes));
- return &m_Pools.front();
- }
-}
-
-void GpuFsaMemoryManager::Allocate(GpuFsaMemoryManager::Pool* pool)
-{
- ARMNN_ASSERT(pool);
- m_FreePools.push_back(pool);
-}
-
-void* GpuFsaMemoryManager::GetPointer(GpuFsaMemoryManager::Pool* pool)
-{
- return pool->GetPointer();
-}
-
-void GpuFsaMemoryManager::Acquire()
-{
- for (Pool &pool: m_Pools)
- {
- pool.Acquire();
- }
-}
-
-void GpuFsaMemoryManager::Release()
-{
- for (Pool &pool: m_Pools)
- {
- pool.Release();
- }
-}
-
-GpuFsaMemoryManager::Pool::Pool(unsigned int numBytes)
- : m_Size(numBytes),
- m_Pointer(nullptr)
-{}
-
-GpuFsaMemoryManager::Pool::~Pool()
-{
- if (m_Pointer)
- {
- Release();
- }
-}
-
-void* GpuFsaMemoryManager::Pool::GetPointer()
-{
- ARMNN_ASSERT_MSG(m_Pointer, "GpuFsaMemoryManager::Pool::GetPointer() called when memory not acquired");
- return m_Pointer;
-}
-
-void GpuFsaMemoryManager::Pool::Reserve(unsigned int numBytes)
-{
- ARMNN_ASSERT_MSG(!m_Pointer, "GpuFsaMemoryManager::Pool::Reserve() cannot be called after memory acquired");
- m_Size = std::max(m_Size, numBytes);
-}
-
-void GpuFsaMemoryManager::Pool::Acquire()
-{
- ARMNN_ASSERT_MSG(!m_Pointer, "GpuFsaMemoryManager::Pool::Acquire() called when memory already acquired");
- m_Pointer = ::operator new(size_t(m_Size));
-}
-
-void GpuFsaMemoryManager::Pool::Release()
-{
- ARMNN_ASSERT_MSG(m_Pointer, "GpuFsaMemoryManager::Pool::Release() called when memory not acquired");
- ::operator delete(m_Pointer);
- m_Pointer = nullptr;
-}
-
-} \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp
deleted file mode 100644
index 636b839a51..0000000000
--- a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-#pragma once
-
-#include <armnn/backends/IMemoryManager.hpp>
-
-#include <forward_list>
-#include <vector>
-
-namespace armnn
-{
-
-// A dummy MemoryManager which will be deleted once the GpuFsa Backend is integrated with ClMemoryManager
-class GpuFsaMemoryManager : public IMemoryManager
-{
-public:
- GpuFsaMemoryManager();
- virtual ~GpuFsaMemoryManager();
-
- class Pool;
-
- Pool* Manage(unsigned int numBytes);
-
- void Allocate(Pool *pool);
-
- void* GetPointer(Pool *pool);
-
- void Acquire() override;
- void Release() override;
-
- class Pool
- {
- public:
- Pool(unsigned int numBytes);
- ~Pool();
-
- void Acquire();
- void Release();
-
- void* GetPointer();
-
- void Reserve(unsigned int numBytes);
-
- private:
- unsigned int m_Size;
- void* m_Pointer;
- };
-
-private:
- GpuFsaMemoryManager(const GpuFsaMemoryManager&) = delete; // Noncopyable
- GpuFsaMemoryManager& operator=(const GpuFsaMemoryManager&) = delete; // Noncopyable
-
- std::forward_list<Pool> m_Pools;
- std::vector<Pool*> m_FreePools;
-};
-
-}
diff --git a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp
index 875b7d7112..9efb300576 100644
--- a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp
+++ b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp
@@ -1,9 +1,11 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
+
#include "GpuFsaBackend.hpp"
#include <armnn/BackendRegistry.hpp>
+
namespace
{
using namespace armnn;
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp
deleted file mode 100644
index e806be49bb..0000000000
--- a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-#include "GpuFsaTensorHandle.hpp"
-
-namespace armnn
-{
-GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo,
- std::shared_ptr<GpuFsaMemoryManager>& memoryManager)
- : m_TensorInfo(tensorInfo)
- , m_MemoryManager(memoryManager)
- , m_Pool(nullptr)
- , m_UnmanagedMemory(nullptr)
- , m_ImportFlags(static_cast<MemorySourceFlags>(MemorySource::Undefined))
- , m_Imported(false)
- , m_IsImportEnabled(false)
-{}
-
-GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo,
- MemorySourceFlags importFlags)
- : m_TensorInfo(tensorInfo)
- , m_Pool(nullptr)
- , m_UnmanagedMemory(nullptr)
- , m_ImportFlags(importFlags)
- , m_Imported(false)
- , m_IsImportEnabled(true)
-{}
-
-GpuFsaTensorHandle::~GpuFsaTensorHandle()
-{
- if (!m_Pool)
- {
- // unmanaged
- if (!m_Imported)
- {
- ::operator delete(m_UnmanagedMemory);
- }
- }
-}
-
-void GpuFsaTensorHandle::Manage()
-{
- if (!m_IsImportEnabled)
- {
- ARMNN_ASSERT_MSG(!m_Pool, "GpuFsaTensorHandle::Manage() called twice");
- ARMNN_ASSERT_MSG(!m_UnmanagedMemory, "GpuFsaTensorHandle::Manage() called after Allocate()");
-
- m_Pool = m_MemoryManager->Manage(m_TensorInfo.GetNumBytes());
- }
-}
-
-void GpuFsaTensorHandle::Allocate()
-{
- // If import is enabled, do not allocate the tensor
- if (!m_IsImportEnabled)
- {
-
- if (!m_UnmanagedMemory)
- {
- if (!m_Pool)
- {
- // unmanaged
- m_UnmanagedMemory = ::operator new(m_TensorInfo.GetNumBytes());
- }
- else
- {
- m_MemoryManager->Allocate(m_Pool);
- }
- }
- else
- {
- throw InvalidArgumentException("GpuFsaTensorHandle::Allocate Trying to allocate a GpuFsaTensorHandle"
- "that already has allocated memory.");
- }
- }
-}
-
-const void* GpuFsaTensorHandle::Map(bool /*unused*/) const
-{
- return GetPointer();
-}
-
-void* GpuFsaTensorHandle::GetPointer() const
-{
- if (m_UnmanagedMemory)
- {
- return m_UnmanagedMemory;
- }
- else if (m_Pool)
- {
- return m_MemoryManager->GetPointer(m_Pool);
- }
- else
- {
- throw NullPointerException("GpuFsaTensorHandle::GetPointer called on unmanaged, unallocated tensor handle");
- }
-}
-
-void GpuFsaTensorHandle::CopyOutTo(void* dest) const
-{
- const void *src = GetPointer();
- ARMNN_ASSERT(src);
- memcpy(dest, src, m_TensorInfo.GetNumBytes());
-}
-
-void GpuFsaTensorHandle::CopyInFrom(const void* src)
-{
- void *dest = GetPointer();
- ARMNN_ASSERT(dest);
- memcpy(dest, src, m_TensorInfo.GetNumBytes());
-}
-
-bool GpuFsaTensorHandle::Import(void* memory, MemorySource source)
-{
- if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
- {
- if (m_IsImportEnabled && source == MemorySource::Malloc)
- {
- // Check memory alignment
- if(!CanBeImported(memory, source))
- {
- if (m_Imported)
- {
- m_Imported = false;
- m_UnmanagedMemory = nullptr;
- }
- return false;
- }
-
- // m_UnmanagedMemory not yet allocated.
- if (!m_Imported && !m_UnmanagedMemory)
- {
- m_UnmanagedMemory = memory;
- m_Imported = true;
- return true;
- }
-
- // m_UnmanagedMemory initially allocated with Allocate().
- if (!m_Imported && m_UnmanagedMemory)
- {
- return false;
- }
-
- // m_UnmanagedMemory previously imported.
- if (m_Imported)
- {
- m_UnmanagedMemory = memory;
- return true;
- }
- }
- }
-
- return false;
-}
-
-bool GpuFsaTensorHandle::CanBeImported(void* memory, MemorySource source)
-{
- if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
- {
- if (m_IsImportEnabled && source == MemorySource::Malloc)
- {
- uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType());
- if (reinterpret_cast<uintptr_t>(memory) % alignment)
- {
- return false;
- }
- return true;
- }
- }
- return false;
-}
-
-
-
-} \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp
index b2da50a467..d6901d1225 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp
@@ -1,83 +1,361 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
-#include <armnn/backends/TensorHandle.hpp>
+#include <aclCommon/ArmComputeTensorHandle.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
-#include "GpuFsaMemoryManager.hpp"
+#include <armnn/utility/PolymorphicDowncast.hpp>
+#include <Half.hpp>
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/CLSubTensor.h>
+#include <arm_compute/runtime/IMemoryGroup.h>
+#include <arm_compute/runtime/MemoryGroup.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Coordinates.h>
+
+#include <aclCommon/IClTensorHandle.hpp>
namespace armnn
{
-// An implementation of ITensorHandle with simple "bump the pointer" memory-management behaviour
-// Will be refactored to look more like ClTensorHandle.hpp and use ClMemoryManager instead of GpuFsaMemoryManager
-class GpuFsaTensorHandle : public ITensorHandle
+class GpuFsaTensorHandle : public IClTensorHandle
{
public:
- GpuFsaTensorHandle(const TensorInfo& tensorInfo, std::shared_ptr<GpuFsaMemoryManager>& memoryManager);
+ GpuFsaTensorHandle(const TensorInfo& tensorInfo)
+ : m_ImportFlags(static_cast<MemorySourceFlags>(MemorySource::Undefined)),
+ m_Imported(false),
+ m_IsImportEnabled(false)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo);
+ }
- GpuFsaTensorHandle(const TensorInfo& tensorInfo, MemorySourceFlags importFlags);
+ GpuFsaTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout,
+ MemorySourceFlags importFlags = static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ : m_ImportFlags(importFlags),
+ m_Imported(false),
+ m_IsImportEnabled(false)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout);
+ }
- ~GpuFsaTensorHandle();
+ arm_compute::CLTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; }
+ virtual void Allocate() override
+ {
+ // If we have enabled Importing, don't allocate the tensor
+ if (m_IsImportEnabled)
+ {
+ throw MemoryImportException("GpuFsaTensorHandle::Attempting to allocate memory when importing");
+ }
+ else
+ {
+ armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);
+ }
- virtual void Manage() override;
+ }
- virtual void Allocate() override;
+ virtual void Manage() override
+ {
+ // If we have enabled Importing, don't manage the tensor
+ if (m_IsImportEnabled)
+ {
+ throw MemoryImportException("GpuFsaTensorHandle::Attempting to manage memory when importing");
+ }
+ else
+ {
+ assert(m_MemoryGroup != nullptr);
+ m_MemoryGroup->manage(&m_Tensor);
+ }
+ }
- virtual ITensorHandle* GetParent() const override
+ virtual const void* Map(bool blocking = true) const override
{
- return nullptr;
+ const_cast<arm_compute::CLTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
}
- virtual const void* Map(bool /* blocking = true */) const override;
- using ITensorHandle::Map;
+ virtual void Unmap() const override { const_cast<arm_compute::CLTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
- virtual void Unmap() const override
- {}
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override
+ {
+ m_MemoryGroup = PolymorphicPointerDowncast<arm_compute::MemoryGroup>(memoryGroup);
+ }
TensorShape GetStrides() const override
{
- return GetUnpaddedTensorStrides(m_TensorInfo);
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
}
TensorShape GetShape() const override
{
- return m_TensorInfo.GetShape();
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
}
- const TensorInfo& GetTensorInfo() const
+ void SetImportFlags(MemorySourceFlags importFlags)
{
- return m_TensorInfo;
+ m_ImportFlags = importFlags;
}
- virtual MemorySourceFlags GetImportFlags() const override
+ MemorySourceFlags GetImportFlags() const override
{
return m_ImportFlags;
}
- virtual bool Import(void* memory, MemorySource source) override;
- virtual bool CanBeImported(void* memory, MemorySource source) override;
+ void SetImportEnabledFlag(bool importEnabledFlag)
+ {
+ m_IsImportEnabled = importEnabledFlag;
+ }
-private:
- // Only used for testing
- void CopyOutTo(void*) const override;
- void CopyInFrom(const void*) override;
+ virtual bool Import(void* /*memory*/, MemorySource source) override
+ {
+ if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
+ {
+ throw MemoryImportException("GpuFsaTensorHandle::Incorrect import flag");
+ }
+ m_Imported = false;
+ return false;
+ }
- void* GetPointer() const;
+ virtual bool CanBeImported(void* /*memory*/, MemorySource /*source*/) override
+ {
+ // This TensorHandle can never import.
+ return false;
+ }
- GpuFsaTensorHandle(const GpuFsaTensorHandle& other) = delete; // noncopyable
- GpuFsaTensorHandle& operator=(const GpuFsaTensorHandle& other) = delete; //noncopyable
+private:
+ // Only used for testing
+ void CopyOutTo(void* memory) const override
+ {
+ const_cast<armnn::GpuFsaTensorHandle*>(this)->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<float*>(memory));
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<uint8_t*>(memory));
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int8_t*>(memory));
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::Half*>(memory));
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int16_t*>(memory));
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int32_t*>(memory));
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ const_cast<armnn::GpuFsaTensorHandle*>(this)->Unmap();
+ }
- TensorInfo m_TensorInfo;
+ // Only used for testing
+ void CopyInFrom(const void* memory) override
+ {
+ this->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+ this->GetTensor());
+ break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+ this->GetTensor());
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ this->Unmap();
+ }
- std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager;
- GpuFsaMemoryManager::Pool* m_Pool;
- mutable void* m_UnmanagedMemory;
+ arm_compute::CLTensor m_Tensor;
+ std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup;
MemorySourceFlags m_ImportFlags;
bool m_Imported;
bool m_IsImportEnabled;
};
-}
\ No newline at end of file
+class GpuFsaSubTensorHandle : public IClTensorHandle
+{
+public:
+ GpuFsaSubTensorHandle(IClTensorHandle* parent,
+ const arm_compute::TensorShape& shape,
+ const arm_compute::Coordinates& coords)
+ : m_Tensor(&parent->GetTensor(), shape, coords)
+ {
+ parentHandle = parent;
+ }
+
+ arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; }
+
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
+
+ virtual const void* Map(bool blocking = true) const override
+ {
+ const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override { const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle* GetParent() const override { return parentHandle; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
+private:
+ // Only used for testing
+ void CopyOutTo(void* memory) const override
+ {
+ const_cast<GpuFsaSubTensorHandle*>(this)->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<float*>(memory));
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<uint8_t*>(memory));
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::Half*>(memory));
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int8_t*>(memory));
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int16_t*>(memory));
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int32_t*>(memory));
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ const_cast<GpuFsaSubTensorHandle*>(this)->Unmap();
+ }
+
+ // Only used for testing
+ void CopyInFrom(const void* memory) override
+ {
+ this->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+ this->GetTensor());
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ this->Unmap();
+ }
+
+ mutable arm_compute::CLSubTensor m_Tensor;
+ ITensorHandle* parentHandle = nullptr;
+};
+
+} // namespace armnn
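
For orientation, here is a minimal caller-side sketch of the CL-backed handle defined above. It is an illustrative sketch rather than part of the patch, and it assumes the gpuFsa headers are on the include path and that a valid OpenCL context has already been set up (as GpuFsaBackendContext and GpuFsaContextControl arrange):

    #include <gpuFsa/GpuFsaTensorHandle.hpp>   // assumed include path, mirroring the tests below

    void FillWithThrees()
    {
        using namespace armnn;

        TensorInfo info(TensorShape({2, 2}), DataType::Float32);
        GpuFsaTensorHandle handle(info);   // import is disabled by default, so Allocate() is allowed
        handle.Allocate();                 // initialises the underlying arm_compute::CLTensor

        // Map() returns a host-visible pointer into the CL buffer; Unmap() releases the mapping.
        auto* data = static_cast<float*>(const_cast<void*>(handle.Map(true)));
        for (unsigned int i = 0; i < info.GetNumElements(); ++i)
        {
            data[i] = 3.0f;
        }
        handle.Unmap();
    }
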
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
index cd9d8cd64d..c1a34d24e5 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
@@ -1,32 +1,50 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "GpuFsaTensorHandle.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
-#include "armnn/Logging.hpp"
-#include <armnn/utility/IgnoreUnused.hpp>
-
namespace armnn
{
using FactoryId = ITensorHandleFactory::FactoryId;
-const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
-{
- static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
- return s_Id;
-}
-
std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateSubTensorHandle(ITensorHandle& parent,
- const TensorShape& subTensorShape,
- const unsigned int* subTensorOrigin)
- const
+ const TensorShape& subTensorShape,
+ const unsigned int* subTensorOrigin) const
{
- IgnoreUnused(parent, subTensorShape, subTensorOrigin);
- return nullptr;
+ arm_compute::Coordinates coords;
+ arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
+
+ coords.set_num_dimensions(subTensorShape.GetNumDimensions());
+ for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); ++i)
+ {
+ // Arm compute indexes tensor coords in reverse order.
+ unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
+ coords.set(i, armnn::numeric_cast<int>(subTensorOrigin[revertedIndex]));
+ }
+
+ const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.GetShape());
+
+    // For ACL to support sub-tensors, the concatenation axis cannot be on x or y,
+    // and the x and y values must match the parent shape.
+ if (coords.x() != 0 || coords.y() != 0)
+ {
+ return nullptr;
+ }
+ if ((parentShape.x() != shape.x()) || (parentShape.y() != shape.y()))
+ {
+ return nullptr;
+ }
+
+ if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape))
+ {
+ return nullptr;
+ }
+
+ return std::make_unique<GpuFsaSubTensorHandle>(PolymorphicDowncast<IClTensorHandle*>(&parent), shape, coords);
}
std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
@@ -43,25 +61,32 @@ std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(con
std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
const bool IsMemoryManaged) const
{
- std::unique_ptr<GpuFsaTensorHandle> handle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
if (!IsMemoryManaged)
{
ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed.";
}
- return handle;
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+ return tensorHandle;
}
std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
DataLayout dataLayout,
const bool IsMemoryManaged) const
{
- IgnoreUnused(dataLayout);
- std::unique_ptr<GpuFsaTensorHandle> handle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
if (!IsMemoryManaged)
{
ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed.";
}
- return handle;
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+ return tensorHandle;
+}
+
+const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
+{
+ static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
+ return s_Id;
}
const FactoryId& GpuFsaTensorHandleFactory::GetId() const
@@ -71,7 +96,7 @@ const FactoryId& GpuFsaTensorHandleFactory::GetId() const
bool GpuFsaTensorHandleFactory::SupportsSubTensors() const
{
- return false;
+ return true;
}
MemorySourceFlags GpuFsaTensorHandleFactory::GetExportFlags() const
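
As a worked illustration of the origin reversal in CreateSubTensorHandle above (a sketch, assuming only the headers named here): Arm NN passes the sub-tensor origin outermost-dimension first, while arm_compute::Coordinates expects it innermost first, so the loop mirrors the indices.

    #include <arm_compute/core/Coordinates.h>
    #include <armnn/Tensor.hpp>
    #include <armnn/utility/NumericCast.hpp>

    // Same index mirroring as the factory loop above: a 4-D origin of {0, 0, 2, 0}
    // in Arm NN order becomes the ACL coordinates (0, 2, 0, 0).
    arm_compute::Coordinates ReverseOrigin(const armnn::TensorShape& subShape,
                                           const unsigned int* origin)
    {
        arm_compute::Coordinates coords;
        coords.set_num_dimensions(subShape.GetNumDimensions());
        for (unsigned int i = 0; i < subShape.GetNumDimensions(); ++i)
        {
            const unsigned int revertedIndex = subShape.GetNumDimensions() - i - 1;
            coords.set(i, armnn::numeric_cast<int>(origin[revertedIndex]));
        }
        return coords;
    }
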
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
index 9f88de598b..93a44259f6 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
@@ -1,14 +1,13 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
-
#pragma once
-#include "GpuFsaMemoryManager.hpp"
-
#include <armnn/backends/ITensorHandleFactory.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+
namespace armnn
{
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
index 687c8c0ac8..6d13879f51 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
@@ -1,10 +1,10 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
+
#include <Layer.hpp>
-#include <armnn/backends/MemCopyWorkload.hpp>
-#include <armnn/backends/TensorHandle.hpp>
+
#include "GpuFsaWorkloadFactory.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaTensorHandle.hpp"
@@ -17,11 +17,9 @@ namespace
static const BackendId s_Id{GpuFsaBackendId()};
}
template <typename QueueDescriptorType>
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& descriptor,
- const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& /*descriptor*/,
+ const WorkloadInfo& /*info*/) const
{
- IgnoreUnused(descriptor);
- IgnoreUnused(info);
return nullptr;
}
@@ -64,51 +62,29 @@ bool GpuFsaWorkloadFactory::IsLayerSupported(const Layer& layer,
return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported);
}
-bool GpuFsaWorkloadFactory::IsLayerSupported(const IConnectableLayer& layer,
- Optional<DataType> dataType,
- std::string& outReasonIfUnsupported,
- const ModelOptions& modelOptions)
-{
- return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported, modelOptions);
-}
-
std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
- const bool isMemoryManaged) const
+ const bool /*isMemoryManaged*/) const
{
- if (isMemoryManaged)
- {
- return std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
- }
- else
- {
- return std::make_unique<GpuFsaTensorHandle>(tensorInfo, static_cast<unsigned int>(MemorySource::Malloc));
- }
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+
+ return tensorHandle;
}
std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
DataLayout dataLayout,
- const bool isMemoryManaged) const
+ const bool /*isMemoryManaged*/) const
{
- IgnoreUnused(dataLayout);
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
- if (isMemoryManaged)
- {
- return std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
- }
- else
- {
- return std::make_unique<GpuFsaTensorHandle>(tensorInfo, static_cast<unsigned int>(MemorySource::Malloc));
- }
+ return tensorHandle;
}
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType type,
- const QueueDescriptor& descriptor,
- const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/,
+ const QueueDescriptor& /*descriptor*/,
+ const WorkloadInfo& /*info*/) const
{
- IgnoreUnused(type);
- IgnoreUnused(descriptor);
- IgnoreUnused(info);
-
return nullptr;
}
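
A brief caller-side sketch of the reworked factory (a hypothetical helper, not part of the patch): both CreateTensorHandle overloads now ignore isMemoryManaged and always hand back a CL-backed handle registered with the factory's inter-layer memory group, while CreateWorkload still returns nullptr for every layer type, so experimental callers must treat that as 'layer unsupported' for now.

    #include "GpuFsaWorkloadFactory.hpp"   // assumed to be on the include path
    #include <armnn/Tensor.hpp>
    #include <armnn/Types.hpp>
    #include <memory>

    // Hypothetical helper: 'factory' is assumed to have been obtained from the GpuFsa backend.
    std::unique_ptr<armnn::ITensorHandle> MakeNhwcHandle(const armnn::GpuFsaWorkloadFactory& factory,
                                                         const armnn::TensorInfo& info)
    {
        // The skeleton always returns a managed, CL-backed handle here.
        return factory.CreateTensorHandle(info, armnn::DataLayout::NHWC, true);
    }
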
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
index 0d80f0363c..9b97070766 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
@@ -1,14 +1,12 @@
//
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
-#include "GpuFsaMemoryManager.hpp"
+#include <aclCommon/BaseMemoryManager.hpp>
#include <armnn/Optional.hpp>
-#include <armnn/backends/WorkloadFactory.hpp>
-#include <armnn/utility/IgnoreUnused.hpp>
namespace armnn
{
@@ -28,19 +26,13 @@ public:
Optional<DataType> dataType,
std::string& outReasonIfUnsupported);
- static bool IsLayerSupported(const IConnectableLayer& layer,
- Optional<DataType> dataType,
- std::string& outReasonIfUnsupported,
- const ModelOptions& modelOptions);
-
bool SupportsSubTensors() const override { return false; }
ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateSubTensorHandle instead")
- std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& parent,
- TensorShape const& subTensorShape,
- unsigned int const* subTensorOrigin) const override
+ std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& /*parent*/,
+ TensorShape const& /*subTensorShape*/,
+ unsigned int const* /*subTensorOrigin*/) const override
{
- IgnoreUnused(parent, subTensorShape, subTensorOrigin);
return nullptr;
}
diff --git a/src/backends/gpuFsa/backend.cmake b/src/backends/gpuFsa/backend.cmake
index 589af19c22..2f4f5fbc7b 100644
--- a/src/backends/gpuFsa/backend.cmake
+++ b/src/backends/gpuFsa/backend.cmake
@@ -1,12 +1,12 @@
#
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/gpuFsa)
list(APPEND armnnLibraries armnnGpuFsaBackend)
-if(ARMNNGPUFSA)
+if(ARMCOMPUTEGPUFSA)
list(APPEND armnnLibraries armnnGpuFsaBackendWorkloads)
list(APPEND armnnUnitTestLibraries armnnGpuFsaBackendUnitTests)
else()
diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk
index 840e10338c..78ba7ba167 100644
--- a/src/backends/gpuFsa/backend.mk
+++ b/src/backends/gpuFsa/backend.mk
@@ -1,5 +1,5 @@
#
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
@@ -8,23 +8,23 @@
# file in the root of ArmNN
# The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_GPU_FSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_GPU_FSA_ENABLED),1)
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
-# ARMNN_GPU_FSA_ENABLED == 1
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
# Include the source files for the GPU Dynamic Fusion backend
BACKEND_SOURCES := \
GpuFsaBackend.cpp \
+ GpuFsaBackendContext.cpp \
+ GpuFsaContextControl.cpp \
GpuFsaLayerSupport.cpp \
- GpuFsaMemoryManager.cpp \
GpuFsaRegistryInitializer.cpp \
- GpuFsaTensorHandle.cpp \
GpuFsaTensorHandleFactory.cpp \
GpuFsaWorkloadFactory.cpp
else
-# ARMNN_GPU_FSA_ENABLED == 0
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
# No source file will be compiled for the GPU Dynamic Fusion backend
BACKEND_SOURCES :=
@@ -36,10 +36,10 @@ endif
# up by the Android.mk file in the root of ArmNN
# The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_GPU_FSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_GPU_FSA_ENABLED),1)
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
-# ARMNN_GPU_FSA_ENABLED == 1
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
# Include the source files for the GPU Dynamic Fusion backend tests
BACKEND_TEST_SOURCES := \
@@ -49,7 +49,7 @@ BACKEND_TEST_SOURCES := \
test/GpuFsaOptimizedNetworkTests.cpp
else
-# ARMNN_GPU_FSA_ENABLED == 0
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
# No source file will be compiled for the GPU Dynamic Fusion backend tests
BACKEND_TEST_SOURCES :=
diff --git a/src/backends/gpuFsa/test/CMakeLists.txt b/src/backends/gpuFsa/test/CMakeLists.txt
index c600589768..66091e90df 100644
--- a/src/backends/gpuFsa/test/CMakeLists.txt
+++ b/src/backends/gpuFsa/test/CMakeLists.txt
@@ -1,9 +1,10 @@
#
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
#
list(APPEND armnnGpuFsaBackendUnitTests_sources
+ GpuFsaDefaultAllocatorTests.cpp
GpuFsaEndToEndTests.cpp
GpuFsaLayerTests.cpp
GpuFsaLayerSupportTests.cpp
diff --git a/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
new file mode 100644
index 0000000000..1f603e2718
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
@@ -0,0 +1,193 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/backends/ICustomAllocator.hpp>
+#include <armnn/BackendRegistry.hpp>
+#include <armnn/Descriptors.hpp>
+#include <armnn/Exceptions.hpp>
+#include <armnn/IRuntime.hpp>
+#include <armnn/backends/TensorHandle.hpp>
+// Requires the OpenCL backend to be included (GpuFsa)
+#include <gpuFsa/GpuFsaBackend.hpp>
+#include <doctest/doctest.h>
+#include <backendsCommon/DefaultAllocator.hpp>
+#include <armnnTestUtils/MockBackend.hpp>
+#include <gpuFsa/GpuFsaBackendDefaultAllocator.hpp>
+
+using namespace armnn;
+
+namespace
+{
+
+TEST_SUITE("DefaultAllocatorTests")
+{
+
+TEST_CASE("DefaultAllocatorTest")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<DefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMulti")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<DefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+ void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto* inputPtr2 = reinterpret_cast<float*>(alignedInputPtr2);
+ std::fill_n(inputPtr2, numElements, number);
+ CHECK(inputPtr2[0] == 3);
+ CHECK(inputPtr2[1] == 3);
+
+ // No overlap
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMock")
+{
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Initialize Mock Backend
+ MockBackendInitialiser initialiser;
+ auto factoryFun = BackendRegistryInstance().GetFactory(MockBackend().GetIdStatic());
+ ARMNN_ASSERT(factoryFun != nullptr);
+ auto backend = factoryFun();
+ auto defaultAllocator = backend->GetDefaultAllocator();
+
+ // GetMemorySourceType
+ CHECK(defaultAllocator->GetMemorySourceType() == MemorySource::Malloc);
+
+ size_t totalBytes = 1 * sizeof(float);
+ // Allocate
+ void* ptr = defaultAllocator->allocate(totalBytes, 0);
+
+ // GetMemoryRegionAtOffset
+ CHECK(defaultAllocator->GetMemoryRegionAtOffset(ptr, 0, 0));
+
+ // Free
+ defaultAllocator->free(ptr);
+
+ // Clean up
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.Deregister(MockBackend().GetIdStatic());
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+}
+
+
+TEST_SUITE("GpuFsaDefaultAllocatorTests")
+{
+
+TEST_CASE("GpuFsaDefaultAllocatorTest")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<GpuFsaBackendDefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("GpuFsaDefaultAllocatorTestMulti")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<GpuFsaBackendDefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+ void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto* inputPtr2 = reinterpret_cast<float*>(alignedInputPtr2);
+ std::fill_n(inputPtr2, numElements, number);
+ CHECK(inputPtr2[0] == 3);
+ CHECK(inputPtr2[1] == 3);
+
+ // No overlap
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+}
+
+} // anonymous namespace
\ No newline at end of file