author    David Monahan <david.monahan@arm.com>  2023-11-22 13:24:25 +0000
committer David Monahan <david.monahan@arm.com>  2023-12-07 15:21:09 +0000
commit    8a570466aca7ae1619fe8fa715b68419fceb142f (patch)
tree      22d80676e01f4a92fc6d927b6c26d6e5939c5170
parent    748657f2941d28bec810b7eec21e46e288002036 (diff)
download  armnn-8a570466aca7ae1619fe8fa715b68419fceb142f.tar.gz
IVGCVSW-8157 - Rebase existing GpuFsa patches to 23.11
Squashed commit of the following:

    IVGCVSW-7159 Add GpuFsa backend skeleton
    IVGCVSW-7380 Update the GpuFsa Skeleton to build and load ACL
    IVGCVSW-7381 Add IsLayerSupported implementation to GpuFsa backend
    IVGCVSW-7382 Implementation of Conv2d within GpuFsa

Signed-off-by: James Conroy <james.conroy@arm.com>
Signed-off-by: Matthew Sloyan <matthew.sloyan@arm.com>
Signed-off-by: David Monahan <david.monahan@arm.com>
Change-Id: Id23d9ee598535de7b38a99ca223cdf0ad2102cef
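For orientation (not part of the commit itself), a minimal sketch of how an application might request the new backend once it is registered; the network construction is assumed boilerplate, and note that listing "GpuFsa" together with "GpuAcc" now throws InvalidArgumentException (see the Network.cpp hunk below):

    #include <armnn/ArmNN.hpp>

    int main()
    {
        using namespace armnn;
        IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());
        INetworkPtr network = INetwork::Create();
        // ... add input, convolution and output layers here ...

        // Prefer the dynamic-fusion backend, fall back to the reference backend.
        std::vector<BackendId> backends = { "GpuFsa", "CpuRef" };
        IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
        return 0;
    }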
-rw-r--r--  CMakeLists.txt  2
-rw-r--r--  cmake/GlobalConfig.cmake  18
-rw-r--r--  src/armnn/Network.cpp  12
-rw-r--r--  src/backends/aclCommon/ArmComputeTensorUtils.hpp  10
-rw-r--r--  src/backends/aclCommon/BaseMemoryManager.cpp  14
-rw-r--r--  src/backends/aclCommon/BaseMemoryManager.hpp  32
-rw-r--r--  src/backends/aclCommon/CMakeLists.txt  2
-rw-r--r--  src/backends/aclCommon/common.cmake  4
-rw-r--r--  src/backends/gpuFsa/CMakeLists.txt  44
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackend.cpp  310
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackend.hpp  285
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendContext.cpp  233
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendContext.hpp  47
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp  51
-rw-r--r--  src/backends/gpuFsa/GpuFsaBackendId.hpp  12
-rw-r--r--  src/backends/gpuFsa/GpuFsaContextControl.cpp  169
-rw-r--r--  src/backends/gpuFsa/GpuFsaContextControl.hpp  42
-rw-r--r--  src/backends/gpuFsa/GpuFsaLayerSupport.cpp  111
-rw-r--r--  src/backends/gpuFsa/GpuFsaLayerSupport.hpp  24
-rw-r--r--  src/backends/gpuFsa/GpuFsaMemoryManager.cpp  120
-rw-r--r--  src/backends/gpuFsa/GpuFsaMemoryManager.hpp  59
-rw-r--r--  src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp  21
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandle.cpp  188
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandle.hpp  361
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp  112
-rw-r--r--  src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp  55
-rw-r--r--  src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp  91
-rw-r--r--  src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp  59
-rw-r--r--  src/backends/gpuFsa/backend.cmake  15
-rw-r--r--  src/backends/gpuFsa/backend.mk  58
-rw-r--r--  src/backends/gpuFsa/layerValidators/CMakeLists.txt  14
-rw-r--r--  src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp  126
-rw-r--r--  src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp  28
-rw-r--r--  src/backends/gpuFsa/test/CMakeLists.txt  19
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp  193
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp  8
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp  64
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaLayerTests.cpp  12
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp  137
-rw-r--r--  src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp  45
-rw-r--r--  src/backends/gpuFsa/workloads/CMakeLists.txt  16
-rw-r--r--  src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp  39
42 files changed, 3242 insertions, 20 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04b71513b0..29a72e36eb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -523,7 +523,7 @@ install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(DIRECTORY profiling/common/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/armnn/profiling/common/include)
install(DIRECTORY profiling/client/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/armnn/profiling/client/include)
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
target_link_libraries(armnn PUBLIC ${ARMCOMPUTE_LIBRARIES})
endif()
diff --git a/cmake/GlobalConfig.cmake b/cmake/GlobalConfig.cmake
index 008d151c0a..b37417c7cf 100644
--- a/cmake/GlobalConfig.cmake
+++ b/cmake/GlobalConfig.cmake
@@ -11,6 +11,7 @@ option(ARMNN_SAMPLE_APPS_ENABLED "Build Sample ArmNN Applications" ON)
option(BUILD_FOR_COVERAGE "Use no optimization and output .gcno and .gcda files" OFF)
option(ARMCOMPUTENEON "Build with ARM Compute NEON support" OFF)
option(ARMCOMPUTECL "Build with ARM Compute OpenCL support" OFF)
+option(ARMCOMPUTEGPUFSA "Build with GPU Dynamic Fusion Backend" OFF)
option(ARMNNREF "Build with ArmNN reference support" ON)
option(ARMNNTOSAREF "Build with TOSA reference support" OFF)
option(PROFILING_BACKEND_STREAMLINE "Forward the armNN profiling events to DS-5/Streamline as annotations" OFF)
@@ -286,7 +287,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/profiling)
# ARM Compute
# Note that ARM Compute has a different folder layout depending on the branch but also on
# whether it comes from a prepackaged archive (this is why we add several hints below)
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
find_path(ARMCOMPUTE_INCLUDE arm_compute/core/CL/OpenCL.h
PATHS ${ARMCOMPUTE_ROOT}/include
PATHS ${ARMCOMPUTE_ROOT}/applications/arm_compute
@@ -343,7 +344,7 @@ if(ARMCOMPUTENEON)
endif()
# ARM Compute OpenCL backend
-if(ARMCOMPUTECL)
+if(ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
# verify we have a valid flatbuffers include path
find_path(FLATBUFFERS_INCLUDE_PATH flatbuffers/flatbuffers.h
HINTS ${FLATBUFFERS_ROOT}/include /usr/local/include /usr/include)
@@ -367,15 +368,22 @@ if(ARMCOMPUTECL)
include_directories(SYSTEM ${OPENCL_INCLUDE})
- # Add preprocessor definition for ARM Compute OpenCL
- add_definitions(-DARMCOMPUTECL_ENABLED)
+ if(ARMCOMPUTECL)
+ # Add preprocessor definition for ARM Compute OpenCL
+ add_definitions(-DARMCOMPUTECL_ENABLED)
+ endif()
+
+ if(ARMCOMPUTEGPUFSA)
+ # Add preprocessor definition for ARM Compute OpenCL with Fusion
+ add_definitions(-DARMCOMPUTEGPUFSA_ENABLED)
+ endif()
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DARM_COMPUTE_DEBUG_ENABLED")
endif()
# Used by both Arm Compute backends, but should be added
# to the search path after the system directories if necessary
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
find_path(HALF_INCLUDE half/half.hpp)
find_path(HALF_INCLUDE half/half.hpp
PATHS ${ARMCOMPUTE_ROOT}/include
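For reference, a hypothetical configure step that enables the new backend option introduced above (all paths are placeholders, not taken from this patch):

    cmake .. -DARMCOMPUTEGPUFSA=1 \
             -DARMCOMPUTE_ROOT=/path/to/ComputeLibrary \
             -DFLATBUFFERS_ROOT=/path/to/flatbuffers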
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index f18c6bfb48..4f82f20aa2 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1906,6 +1906,18 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph,
ProfilerManager::GetInstance().RegisterProfiler(profiler.get());
profiler->EnableProfiling(options.GetProfilingEnabled());
+ // Some backends don't play well together. Check here before continuing.
+ {
+ std::set<BackendId> backendSet(backendPreferences.begin(), backendPreferences.end());
+ // GpuFsa cannot co-exist with GpuAcc.
+ if (backendSet.find("GpuFsa") != backendSet.end() &&
+ backendSet.find("GpuAcc") != backendSet.end())
+ {
+ throw InvalidArgumentException("The backends \"GpuAcc\" and \"GpuFsa\" cannot be specified "
+ "for the same optimized network.");
+ }
+ }
+
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Optimizer");
if (backendPreferences.empty())
{
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.hpp b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
index f5ae770d6b..d8a41fe41f 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
@@ -121,6 +121,16 @@ arm_compute::PadStrideInfo BuildArmComputePadStrideInfo(const Descriptor& descri
arm_compute::DimensionRoundingType::FLOOR);
}
+/// Utility function used to setup an arm_compute::Padding2D object from an armnn layer descriptor.
+template <typename Descriptor>
+arm_compute::Padding2D BuildArmComputePaddingInfo(const Descriptor &descriptor)
+{
+ return arm_compute::Padding2D(descriptor.m_PadLeft,
+ descriptor.m_PadRight,
+ descriptor.m_PadTop,
+ descriptor.m_PadBottom);
+}
+
/// Utility function used to setup an arm_compute::CropInfo object from an ArmNN layer descriptor.
template <typename Descriptor>
arm_compute::CropInfo BuildArmComputeCropInfo(const Descriptor& descriptor, const unsigned int rank = 4)
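A short usage sketch for the new BuildArmComputePaddingInfo helper (values are illustrative; any armnn descriptor exposing m_PadLeft/m_PadRight/m_PadTop/m_PadBottom works):

    armnn::Pooling2dDescriptor desc;
    desc.m_PadLeft   = 1;
    desc.m_PadRight  = 1;
    desc.m_PadTop    = 2;
    desc.m_PadBottom = 2;
    // Produces arm_compute::Padding2D{1, 1, 2, 2} (left, right, top, bottom).
    arm_compute::Padding2D pad =
        armnn::armcomputetensorutils::BuildArmComputePaddingInfo(desc);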
diff --git a/src/backends/aclCommon/BaseMemoryManager.cpp b/src/backends/aclCommon/BaseMemoryManager.cpp
index c60a4a04ae..206cf9b230 100644
--- a/src/backends/aclCommon/BaseMemoryManager.cpp
+++ b/src/backends/aclCommon/BaseMemoryManager.cpp
@@ -1,10 +1,10 @@
//
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "BaseMemoryManager.hpp"
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/PoolManager.h"
#include "arm_compute/runtime/OffsetLifetimeManager.h"
@@ -14,7 +14,7 @@
namespace armnn
{
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
BaseMemoryManager::BaseMemoryManager(std::shared_ptr<arm_compute::IAllocator> alloc,
MemoryAffinity memoryAffinity)
{
@@ -104,4 +104,12 @@ ClMemoryManager::CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryMana
}
#endif
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+std::shared_ptr<arm_compute::IMemoryGroup>
+GpuFsaMemoryManager::CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+{
+ return std::make_shared<arm_compute::MemoryGroup>(memoryManager);
+}
+#endif
+
}
diff --git a/src/backends/aclCommon/BaseMemoryManager.hpp b/src/backends/aclCommon/BaseMemoryManager.hpp
index af099f900a..04e0d640ab 100644
--- a/src/backends/aclCommon/BaseMemoryManager.hpp
+++ b/src/backends/aclCommon/BaseMemoryManager.hpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
@@ -7,17 +7,13 @@
#include <armnn/backends/IMemoryManager.hpp>
#include <armnn/backends/WorkloadFactory.hpp>
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
-#include <arm_compute/runtime/MemoryGroup.h>
-#endif
-
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
#include <arm_compute/runtime/IAllocator.h>
#include <arm_compute/runtime/IMemoryGroup.h>
#include <arm_compute/runtime/MemoryManagerOnDemand.h>
#endif
-#if defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
#include <arm_compute/runtime/CL/CLTensorAllocator.h>
#endif
@@ -39,7 +35,7 @@ public:
void Acquire() override;
void Release() override;
-#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED)
+#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED)
BaseMemoryManager(std::shared_ptr<arm_compute::IAllocator> alloc, MemoryAffinity memoryAffinity);
std::shared_ptr<arm_compute::MemoryManagerOnDemand>& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; }
@@ -98,4 +94,24 @@ protected:
};
#endif
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+class GpuFsaMemoryManager : public BaseMemoryManager
+{
+public:
+ GpuFsaMemoryManager() {}
+ virtual ~GpuFsaMemoryManager() {}
+
+ GpuFsaMemoryManager(std::shared_ptr<arm_compute::IAllocator> alloc)
+ : BaseMemoryManager(alloc, MemoryAffinity::Buffer)
+ {
+ arm_compute::CLTensorAllocator::set_global_allocator(alloc.get());
+ m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr);
+ }
+
+protected:
+ std::shared_ptr<arm_compute::IMemoryGroup>
+ CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) override;
+};
+#endif
+
} //namespace armnn
diff --git a/src/backends/aclCommon/CMakeLists.txt b/src/backends/aclCommon/CMakeLists.txt
index eaaed65c17..0e17982946 100644
--- a/src/backends/aclCommon/CMakeLists.txt
+++ b/src/backends/aclCommon/CMakeLists.txt
@@ -14,7 +14,7 @@ list(APPEND armnnAclCommon_sources
IClTensorHandle.hpp
)
-if(ARMCOMPUTECL)
+if(ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
list(APPEND armnnAclCommon_sources
ArmComputeTuningUtils.hpp
ArmComputeTuningUtils.cpp
diff --git a/src/backends/aclCommon/common.cmake b/src/backends/aclCommon/common.cmake
index 89be236a7f..0acbb201b5 100644
--- a/src/backends/aclCommon/common.cmake
+++ b/src/backends/aclCommon/common.cmake
@@ -1,9 +1,9 @@
#
-# Copyright © 2017 Arm Ltd. All rights reserved.
+# Copyright © 2017-2023 Arm Ltd. All rights reserved.
# SPDX-License-Identifier: MIT
#
-if(ARMCOMPUTENEON OR ARMCOMPUTECL)
+if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA)
add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/aclCommon)
list(APPEND armnnLibraries armnnAclCommon)
list(APPEND armnnUnitTestLibraries armnnAclCommonUnitTests)
diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt
new file mode 100644
index 0000000000..8d1a58ee27
--- /dev/null
+++ b/src/backends/gpuFsa/CMakeLists.txt
@@ -0,0 +1,44 @@
+#
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+if(ARMCOMPUTEGPUFSA)
+ list(APPEND armnnGpuFsaBackend_sources
+ GpuFsaBackend.cpp
+ GpuFsaBackend.hpp
+ GpuFsaBackendContext.cpp
+ GpuFsaBackendContext.hpp
+ GpuFsaBackendDefaultAllocator.hpp
+ GpuFsaBackendId.hpp
+ GpuFsaContextControl.cpp
+ GpuFsaContextControl.hpp
+ GpuFsaLayerSupport.cpp
+ GpuFsaLayerSupport.hpp
+ GpuFsaRegistryInitializer.cpp
+ GpuFsaTensorHandle.hpp
+ GpuFsaTensorHandleFactory.cpp
+ GpuFsaTensorHandleFactory.hpp
+ GpuFsaWorkloadFactory.cpp
+ GpuFsaWorkloadFactory.hpp
+ )
+
+ add_subdirectory(layerValidators)
+ add_subdirectory(workloads)
+
+ if(BUILD_UNIT_TESTS)
+ add_subdirectory(test)
+ endif()
+
+else()
+ list(APPEND armnnGpuFsaBackend_sources
+ GpuFsaBackendId.hpp
+ GpuFsaLayerSupport.cpp
+ GpuFsaLayerSupport.hpp
+ )
+endif()
+
+add_library(armnnGpuFsaBackend OBJECT ${armnnGpuFsaBackend_sources})
+target_include_directories(armnnGpuFsaBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnGpuFsaBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
+target_include_directories(armnnGpuFsaBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/backends)
\ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp
new file mode 100644
index 0000000000..8ea9e8e7d3
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackend.cpp
@@ -0,0 +1,310 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaBackend.hpp"
+#include "GpuFsaBackendContext.hpp"
+#include "GpuFsaBackendDefaultAllocator.hpp"
+#include "GpuFsaBackendId.hpp"
+#include "GpuFsaLayerSupport.hpp"
+#include "GpuFsaTensorHandleFactory.hpp"
+#include "GpuFsaWorkloadFactory.hpp"
+
+#include <armnn/backends/IBackendContext.hpp>
+#include <armnn/backends/IMemoryManager.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+#include <backendsCommon/SubgraphUtils.hpp>
+#include <Optimizer.hpp>
+
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
+
+#include "layerValidators/GpuFsaConvolution2dValidate.hpp"
+
+namespace armnn
+{
+
+template <typename T>
+inline void DeleteAsType(const void* const blob)
+{
+ delete static_cast<const T*>(blob);
+}
+
+inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
+{
+ SubgraphView::InputSlots result;
+ for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
+ {
+ result.push_back(&(*it));
+ }
+ return result;
+}
+
+inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
+{
+ SubgraphView::OutputSlots result;
+ for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
+ {
+ result.push_back(&(*it));
+ }
+ return result;
+}
+
+inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
+ SubgraphView::OutputSlots&& outputs,
+ SubgraphView::Layers&& layers)
+{
+ return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
+}
+
+const BackendId& GpuFsaBackend::GetIdStatic()
+{
+ static const BackendId s_Id{GpuFsaBackendId()};
+ return s_Id;
+}
+
+IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
+{
+ if (m_UsingCustomAllocator)
+ {
+ return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+}
+
+IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
+ const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
+{
+ return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
+}
+
+IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
+ TensorHandleFactoryRegistry& registry) const
+{
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
+
+ return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
+}
+
+IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
+ TensorHandleFactoryRegistry& registry,
+ const ModelOptions&,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags) const
+{
+
+ // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
+ if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+ if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
+
+ return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
+}
+
+std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
+{
+ return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
+}
+
+void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
+{
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
+
+}
+
+void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags)
+{
+ // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc
+ if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+ if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+
+ std::shared_ptr<GpuFsaMemoryManager> memoryManager;
+ if (m_UsingCustomAllocator)
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
+ }
+ else
+ {
+ memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+ }
+
+ std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
+ registry.RegisterMemoryManager(memoryManager);
+ registry.RegisterFactory(std::move(factory));
+}
+
+IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
+{
+ return IBackendContextPtr{new GpuFsaBackendContext{options}};
+}
+
+IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
+ const IRuntime::CreationOptions&, IBackendProfilingPtr&)
+{
+ return IBackendProfilingContextPtr{};
+}
+
+IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
+{
+ static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
+ return layerSupport;
+}
+
+std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
+{
+ return std::make_unique<GpuFsaBackendDefaultAllocator>();
+}
+
+OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
+ const ModelOptions& modelOptions) const
+{
+ OptimizationViews optimizationViews(modelOptions);
+
+ using namespace arm_compute::experimental::dynamic_fusion;
+ // Create a new workload sketch, for validation purposes
+ auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context();
+ auto gpuCtx = GpuWorkloadContext(&compileCtx);
+
+ auto it = subgraph.end();
+ std::map<LayerGuid, Layer*> untouched;
+ while (it != subgraph.begin())
+ {
+ --it;
+ Layer& base = *(PolymorphicDowncast<Layer*>(*it));
+ untouched.insert({base.GetGuid(), &base});
+ }
+
+ GpuFsaLayerSupport supportChecker;
+ it = subgraph.end();
+ while (it != subgraph.begin())
+ {
+ --it;
+ Layer& base = *(PolymorphicDowncast<Layer*>(*it));
+
+ std::unique_ptr<GpuWorkloadSketch> sketch = std::make_unique<GpuWorkloadSketch>(&gpuCtx);
+ switch (base.GetType())
+ {
+ case (LayerType::Convolution2d):
+ {
+ auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
+ auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
+ //std::vector<TensorInfo> infos = {input, weights};
+
+ auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
+ if (desc->m_BiasEnabled)
+ {
+ auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
+ GpuFsaConvolution2dCreateOp(input,
+ *desc,
+ weights,
+ bias);
+ }
+ else
+ {
+ GpuFsaConvolution2dCreateOp(input,
+ *desc,
+ weights,
+ EmptyOptional());
+ }
+ break;
+ }
+ default:
+ // unsupported layer for GpuFsa backend
+ continue;
+ }
+
+ auto compiledBlob = std::make_unique<PreCompiledObjectPtr>(sketch.release(), DeleteAsType<GpuWorkloadSketch>);
+
+ IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
+ PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
+ std::move(*compiledBlob),
+ armnn::Optional<BackendId>(GetId()),
+ "GpuFsa_Pre_Compiled_Layer");
+
+ // Copy the output tensor infos from sub-graph
+ for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
+ {
+ preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
+ }
+
+ SubgraphView::SubgraphViewPtr substituteSubgraph =
+ CreateSubgraphViewFrom(CreateInputsFrom(&base),
+ CreateOutputsFrom(&base),
+ {&base});
+
+ optimizationViews.AddSubstitution({ *substituteSubgraph, SubgraphView(preCompiledLayer) });
+
+ untouched.erase(base.GetGuid());
+ }
+
+ if (optimizationViews.GetSubstitutions().empty())
+ {
+ optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
+ }
+ else
+ {
+ ReportUntouchedLayers(optimizationViews, untouched);
+ }
+
+
+ return optimizationViews;
+}
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp
new file mode 100644
index 0000000000..26960065c7
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackend.hpp
@@ -0,0 +1,285 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+#include <arm_compute/runtime/CL/CLMemoryRegion.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <CL/cl_ext.h>
+
+// System includes for mapping and unmapping memory
+#include <sys/mman.h>
+
+namespace armnn
+{
+
+// Add new capabilities here.
+const BackendCapabilities gpuFsaCapabilities("GpuFsa",
+ {
+ {"NonConstWeights", false},
+ {"AsyncExecution", false},
+ {"ProtectedContentAllocation", false},
+ {"ConstantTensorsAsInputs", false},
+ {"PreImportIOTensors", false},
+ {"ExternallyManagedMemory", false},
+ {"MultiAxisPacking", false},
+ {"SingleAxisPacking", false}
+ });
+
+class GpuFsaBackend : public IBackendInternal
+{
+public:
+ GpuFsaBackend() : m_CustomAllocator(nullptr) {};
+ GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator)
+ {
+ UseCustomMemoryAllocator(allocator, armnn::EmptyOptional());
+ }
+ ~GpuFsaBackend() = default;
+
+ static const BackendId& GetIdStatic();
+ const BackendId& GetId() const override { return GetIdStatic(); }
+
+ IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;
+
+ IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
+ const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;
+
+ IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;
+
+ IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
+ const ModelOptions& modelOptions,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags) const override;
+
+ std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
+
+ void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;
+
+ void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+ MemorySourceFlags inputFlags,
+ MemorySourceFlags outputFlags) override;
+
+ IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
+ IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
+ const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;
+
+ IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;
+
+ OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
+ const ModelOptions& modelOptions) const override;
+
+ std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;
+
+ BackendCapabilities GetCapabilities() const override
+ {
+ return gpuFsaCapabilities;
+ };
+
+ virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
+ armnn::Optional<std::string&>) override
+ {
+ ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";
+
+ // Set flag to signal the backend to use a custom memory allocator
+ m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));
+ m_UsingCustomAllocator = true;
+ return m_UsingCustomAllocator;
+ }
+
+ // CL requires an arm_compute::IAllocator; we wrap the Arm NN ICustomAllocator to achieve this
+ class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator
+ {
+ public:
+ GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)
+ {}
+ // Inherited methods overridden:
+ void* allocate(size_t size, size_t alignment) override
+ {
+ auto alloc = m_CustomAllocator->allocate(size, alignment);
+ return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
+ }
+ void free(void* ptr) override
+ {
+ auto hostMemPtr = m_AllocatedBufferMappings[ptr];
+ clReleaseMemObject(static_cast<cl_mem>(ptr));
+ m_CustomAllocator->free(hostMemPtr);
+ }
+ std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
+ {
+ auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
+ cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());
+
+ return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),
+ hostMemPtr,
+ m_CustomAllocator->GetMemorySourceType());
+ }
+ private:
+ cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
+ {
+ // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
+ auto cachelineAlignment =
+ arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+ auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);
+
+ if (source == MemorySource::Malloc)
+ {
+ const cl_import_properties_arm importProperties[] =
+ {
+ CL_IMPORT_TYPE_ARM,
+ CL_IMPORT_TYPE_HOST_ARM,
+ 0
+ };
+ cl_int error = CL_SUCCESS;
+ cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+ CL_MEM_READ_WRITE,
+ importProperties,
+ memory,
+ roundedSize,
+ &error);
+ if (error == CL_SUCCESS)
+ {
+ m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+ return buffer;
+ }
+ throw armnn::Exception(
+ "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
+ }
+ else if (source == MemorySource::DmaBuf)
+ {
+ const cl_import_properties_arm importProperties[] =
+ {
+ CL_IMPORT_TYPE_ARM,
+ CL_IMPORT_TYPE_DMA_BUF_ARM,
+ CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM,
+ CL_TRUE,
+ 0
+ };
+ cl_int error = CL_SUCCESS;
+ cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+ CL_MEM_READ_WRITE,
+ importProperties,
+ memory,
+ roundedSize,
+ &error);
+ if (error == CL_SUCCESS)
+ {
+ m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+ return buffer;
+ }
+ throw armnn::Exception(
+ "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "
+ + std::to_string(error));
+ }
+ else if (source == MemorySource::DmaBufProtected)
+ {
+ const cl_import_properties_arm importProperties[] =
+ {
+ CL_IMPORT_TYPE_ARM,
+ CL_IMPORT_TYPE_DMA_BUF_ARM,
+ CL_IMPORT_TYPE_PROTECTED_ARM,
+ CL_TRUE,
+ 0
+ };
+ cl_int error = CL_SUCCESS;
+ cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+ CL_MEM_READ_WRITE,
+ importProperties,
+ memory,
+ roundedSize,
+ &error);
+ if (error == CL_SUCCESS)
+ {
+ m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+ return buffer;
+ }
+ throw armnn::Exception(
+ "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "
+ + std::to_string(error));
+ }
+ throw armnn::Exception(
+ "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
+ }
+ std::shared_ptr<ICustomAllocator> m_CustomAllocator;
+ std::map<void*, void*> m_AllocatedBufferMappings;
+ };
+
+ class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
+ {
+ public:
+ // We need a new version of ICLMemoryRegion which holds a hostMemPtr to allow for CPU copy access
+ ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)
+ : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+ {
+ _mem = buffer;
+ m_HostMemPtr = hostMemPtr;
+ m_MemorySource = source;
+ }
+
+ // Inherited methods overridden :
+ void* ptr() override
+ {
+ return nullptr;
+ }
+
+ void* map(cl::CommandQueue &q, bool blocking) override
+ {
+ armnn::IgnoreUnused(q, blocking);
+ if (m_HostMemPtr == nullptr)
+ {
+ throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
+ }
+ if (_mapping != nullptr)
+ {
+ throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");
+ }
+ switch (m_MemorySource)
+ {
+ case armnn::MemorySource::Malloc:
+ _mapping = m_HostMemPtr;
+ return _mapping;
+ break;
+ case armnn::MemorySource::DmaBuf:
+ case armnn::MemorySource::DmaBufProtected:
+ // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd
+ _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast<int*>(m_HostMemPtr)), 0);
+ return _mapping;
+ break;
+ default:
+ throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");
+ break;
+ }
+ }
+
+ void unmap(cl::CommandQueue &q) override
+ {
+ armnn::IgnoreUnused(q);
+ switch (m_MemorySource)
+ {
+ case armnn::MemorySource::Malloc:
+ _mapping = nullptr;
+ break;
+ case armnn::MemorySource::DmaBuf:
+ case armnn::MemorySource::DmaBufProtected:
+ munmap(_mapping, _size);
+ _mapping = nullptr;
+ break;
+ default:
+ throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");
+ break;
+ }
+ }
+ private:
+ void* m_HostMemPtr = nullptr;
+ armnn::MemorySource m_MemorySource;
+ };
+
+ std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator;
+ bool m_UsingCustomAllocator = false;
+};
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.cpp b/src/backends/gpuFsa/GpuFsaBackendContext.cpp
new file mode 100644
index 0000000000..84b948303a
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendContext.cpp
@@ -0,0 +1,233 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaBackendContext.hpp"
+#include "GpuFsaContextControl.hpp"
+
+#include <armnn/utility/Assert.hpp>
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
+#include <arm_compute/core/CL/OpenCL.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include <arm_compute/runtime/CL/CLTunerTypes.h>
+
+namespace armnn
+{
+
+struct GpuFsaBackendContext::GpuFsaContextControlWrapper
+{
+ GpuFsaContextControlWrapper(arm_compute::CLTuner* tuner,
+ arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle,
+ bool profilingEnabled)
+ : m_GpuFsaContextControl(tuner, heuristicsHandle, profilingEnabled)
+ {}
+
+ bool Sync()
+ {
+ if (arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // Waits for all queued CL requests to finish before unloading the network they may be using.
+ try
+ {
+ // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error.
+ arm_compute::CLScheduler::get().sync();
+ }
+ catch (const cl::Error& err)
+ {
+ ARMNN_LOG(warning) << "Runtime::UnloadNetwork(): an error occurred while waiting for "
+ "the queued CL requests to finish";
+ throw err;
+ }
+ }
+
+ return true;
+ }
+
+ void ClearClCache()
+ {
+ if (arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // There are no loaded networks left, so clear the CL cache to free up memory
+ m_GpuFsaContextControl.ClearClCache();
+ }
+ }
+
+ GpuFsaContextControl m_GpuFsaContextControl;
+};
+
+GpuFsaBackendContext::GpuFsaBackendContext(const IRuntime::CreationOptions& options)
+ : IBackendContext(options)
+ , m_TuningFile()
+{
+ bool kernelProfiling = options.m_EnableGpuProfiling;
+
+ arm_compute::CLTuner* tuner = nullptr;
+ arm_compute::CLGEMMHeuristicsHandle* mlgoTuner = nullptr;
+ bool useLegacyTunerAPI = options.m_GpuAccTunedParameters.get() != nullptr;
+ if (useLegacyTunerAPI)
+ {
+ auto clTunerParams = PolymorphicDowncast<ClTunedParameters*>(
+ options.m_GpuAccTunedParameters.get());
+ tuner = &clTunerParams->m_Tuner;
+
+ if (tuner)
+ {
+ auto ConvertTuningLevel = [](IGpuAccTunedParameters::TuningLevel level,
+ armnn::IGpuAccTunedParameters::Mode mode)
+ {
+ if (mode == armnn::IGpuAccTunedParameters::Mode::UseTunedParameters)
+ {
+ return TuningLevel::None;
+ }
+
+ switch(level)
+ {
+ case IGpuAccTunedParameters::TuningLevel::Rapid:
+ return TuningLevel::Rapid;
+ case IGpuAccTunedParameters::TuningLevel::Normal:
+ return TuningLevel::Normal;
+ case IGpuAccTunedParameters::TuningLevel::Exhaustive:
+ return TuningLevel::Exhaustive;
+ default:
+ {
+ ARMNN_LOG(warning) << "Tuning level not recognised.";
+ return TuningLevel::None;
+ }
+ }
+ };
+
+ TuningLevel tuningLevel = ConvertTuningLevel(clTunerParams->m_TuningLevel, clTunerParams->m_Mode);
+ ConfigureTuner(*tuner, tuningLevel);
+ }
+ }
+ else //New backend options API
+ {
+ const TuningLevel defaultTuningLevel = TuningLevel::None;
+ auto tuningLevel = defaultTuningLevel;
+
+ ParseOptions(options.m_BackendOptions, "GpuFsa", [&](std::string name, const BackendOptions::Var& value)
+ {
+ if (name == "KernelProfilingEnabled")
+ {
+ kernelProfiling |= ParseBooleanBackendOption(value, false);
+ } else if (name == "TuningFile")
+ {
+ m_TuningFile = ParseStringBackendOption(value, "");
+ } else if (name == "TuningLevel")
+ {
+ tuningLevel = ParseTuningLevel(value, defaultTuningLevel);
+ }
+ else if (name == "MLGOTuningFilePath")
+ {
+ m_MLGOTuningFile = ParseStringBackendOption(value, "");
+ }
+ });
+
+ // Create the tuner, in tuning mode initially.
+ m_Tuner = std::make_unique<arm_compute::CLTuner>(true);
+
+ ConfigureTuner(*(m_Tuner.get()), tuningLevel);
+
+ if (!m_TuningFile.empty())
+ {
+ try
+ {
+ ARMNN_LOG(info) << "Loading Gpu tuning data from file: " << m_TuningFile;
+ m_Tuner->load_from_file(m_TuningFile.c_str());
+ }
+ catch (const std::exception& e)
+ {
+ // Warn if not tuning, otherwise tuning will generate new params
+ if (tuningLevel == TuningLevel::None)
+ {
+ ARMNN_LOG(warning) << "Could not load GpuFsa tuner data file.";
+ }
+ }
+ }
+
+ if (!m_MLGOTuningFile.empty())
+ {
+ try
+ {
+ ARMNN_LOG(info) << "Loading Gpu MLGO tuning data from file: " << m_TuningFile;
+ if(m_MLGOTuner.reload_from_file(m_MLGOTuningFile.c_str()))
+ {
+ mlgoTuner = &m_MLGOTuner;
+ }
+ }
+ catch (const std::exception& e)
+ {
+ ARMNN_LOG(warning) << "Could not load GpuFsa MLGO tuner data file.";
+ }
+ }
+
+ tuner = m_Tuner.get();
+ }
+
+ m_GpuFsaContextControlWrapper = std::make_unique<GpuFsaContextControlWrapper>(
+ tuner,
+ mlgoTuner,
+ kernelProfiling
+ );
+}
+
+bool GpuFsaBackendContext::BeforeLoadNetwork(NetworkId)
+{
+ return true;
+}
+
+bool GpuFsaBackendContext::AfterLoadNetwork(NetworkId networkId)
+{
+ {
+ std::lock_guard<std::mutex> lockGuard(m_Mutex);
+ m_NetworkIds.insert(networkId);
+ }
+ return true;
+}
+
+bool GpuFsaBackendContext::BeforeUnloadNetwork(NetworkId)
+{
+ return m_GpuFsaContextControlWrapper->Sync();
+}
+
+bool GpuFsaBackendContext::AfterUnloadNetwork(NetworkId networkId)
+{
+ bool clearCache = false;
+ {
+ std::lock_guard<std::mutex> lockGuard(m_Mutex);
+ m_NetworkIds.erase(networkId);
+ clearCache = m_NetworkIds.empty();
+ }
+
+ if (clearCache)
+ {
+ m_GpuFsaContextControlWrapper->ClearClCache();
+ }
+
+ return true;
+}
+
+bool GpuFsaBackendContext::AfterEnqueueWorkload(NetworkId)
+{
+ return m_GpuFsaContextControlWrapper->Sync();
+}
+
+GpuFsaBackendContext::~GpuFsaBackendContext()
+{
+ if (m_Tuner && !m_TuningFile.empty())
+ {
+ try
+ {
+ m_Tuner->save_to_file(m_TuningFile.c_str());
+ }
+ catch(const std::exception& e)
+ {
+ ARMNN_LOG(warning) << "Could not save GpuFsa tuner data to file " << m_TuningFile;
+ }
+ }
+}
+
+} // namespace armnn
\ No newline at end of file
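The backend options parsed above can be supplied through the runtime creation options; a hedged sketch (the tuning file name is a placeholder):

    armnn::IRuntime::CreationOptions options;
    options.m_BackendOptions.emplace_back(
        armnn::BackendOptions{"GpuFsa",
            {
                {"KernelProfilingEnabled", true},
                {"TuningLevel", 2},              // 2 == Normal
                {"TuningFile", "gpufsa.tune"}
            }});
    armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);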
diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.hpp b/src/backends/gpuFsa/GpuFsaBackendContext.hpp
new file mode 100644
index 0000000000..271688fd99
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendContext.hpp
@@ -0,0 +1,47 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/backends/IBackendContext.hpp>
+#include <unordered_set>
+#include <mutex>
+
+#include <arm_compute/runtime/CL/CLTuner.h>
+#include <arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h>
+
+namespace armnn
+{
+
+class GpuFsaBackendContext : public IBackendContext
+{
+public:
+ GpuFsaBackendContext(const IRuntime::CreationOptions& options);
+
+ bool BeforeLoadNetwork(NetworkId networkId) override;
+ bool AfterLoadNetwork(NetworkId networkId) override;
+
+ bool BeforeUnloadNetwork(NetworkId networkId) override;
+ bool AfterUnloadNetwork(NetworkId networkId) override;
+
+ bool AfterEnqueueWorkload(NetworkId networkId) override;
+
+ ~GpuFsaBackendContext() override;
+
+private:
+ std::mutex m_Mutex;
+ struct GpuFsaContextControlWrapper;
+ std::unique_ptr<GpuFsaContextControlWrapper> m_GpuFsaContextControlWrapper;
+
+ std::unordered_set<NetworkId> m_NetworkIds;
+
+ std::unique_ptr<arm_compute::CLTuner> m_Tuner;
+ std::string m_TuningFile;
+
+protected:
+ arm_compute::CLGEMMHeuristicsHandle m_MLGOTuner;
+ std::string m_MLGOTuningFile;
+};
+
+} // namespace armnn
\ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp
new file mode 100644
index 0000000000..c57ff63b92
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp
@@ -0,0 +1,51 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <memory>
+
+#include <armnn/MemorySources.hpp>
+#include <armnn/utility/IgnoreUnused.hpp>
+
+namespace armnn
+{
+
+/**
+* Default Memory Allocator class returned from IBackendInternal::GetDefaultAllocator(MemorySource)
+*/
+class GpuFsaBackendDefaultAllocator : public ICustomAllocator
+{
+public:
+ GpuFsaBackendDefaultAllocator() = default;
+
+ void* allocate(size_t size, size_t alignment = 0) override
+ {
+ IgnoreUnused(alignment);
+ cl_mem buf{ clCreateBuffer(arm_compute::CLScheduler::get().context().get(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ size,
+ nullptr,
+ nullptr)};
+ return static_cast<void *>(buf);
+ }
+
+ void free(void* ptr) override
+ {
+ ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+ clReleaseMemObject(static_cast<cl_mem>(ptr));
+ }
+
+ MemorySource GetMemorySourceType() override
+ {
+ return MemorySource::Gralloc;
+ }
+
+ void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override
+ {
+ IgnoreUnused(alignment);
+ return static_cast<char*>(buffer) + offset;
+ }
+};
+} // namespace armnn
\ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaBackendId.hpp b/src/backends/gpuFsa/GpuFsaBackendId.hpp
new file mode 100644
index 0000000000..1231798bf0
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackendId.hpp
@@ -0,0 +1,12 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+namespace armnn
+{
+
+constexpr const char * GpuFsaBackendId() { return "GpuFsa"; }
+
+} // namespace armnn
\ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaContextControl.cpp b/src/backends/gpuFsa/GpuFsaContextControl.cpp
new file mode 100644
index 0000000000..cc53356c0d
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaContextControl.cpp
@@ -0,0 +1,169 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaContextControl.hpp"
+
+#include <armnn/Exceptions.hpp>
+#include <armnn/utility/Assert.hpp>
+#include <LeakChecking.hpp>
+
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <fmt/format.h>
+
+namespace cl
+{
+class Context;
+class CommandQueue;
+class Device;
+}
+
+namespace armnn
+{
+
+GpuFsaContextControl::GpuFsaContextControl(arm_compute::CLTuner *tuner,
+ arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle,
+ bool profilingEnabled)
+ : m_Tuner(tuner)
+ , m_HeuristicsHandle(heuristicsHandle)
+ , m_ProfilingEnabled(profilingEnabled)
+{
+ try
+ {
+ std::vector<cl::Platform> platforms;
+ cl::Platform::get(&platforms);
+
+ // Select the first platform as the default platform.
+ cl::Platform::setDefault(platforms[0]);
+
+ std::vector<cl::Device> devices;
+ platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+
+ // Select the first device as the default device.
+ cl::Device::setDefault(devices[0]);
+ }
+ catch (const cl::Error& clError)
+ {
+ throw ClRuntimeUnavailableException(fmt::format(
+ "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}",
+ clError.what(), clError.err()));
+ }
+
+ // Removes the use of global CL context.
+ cl::Context::setDefault(cl::Context{});
+ if (cl::Context::getDefault()() != NULL)
+ {
+ throw armnn::Exception("GpuFsaContextControl: Unable to remove the global CL context");
+ }
+
+ // Removes the use of global CL command queue.
+ cl::CommandQueue::setDefault(cl::CommandQueue{});
+ if (cl::CommandQueue::getDefault()() != NULL)
+ {
+ throw armnn::Exception("GpuFsaContextControl: Unable to remove the global CL command queue");
+ }
+
+ // Always load the OpenCL runtime.
+ LoadOpenClRuntime();
+}
+
+GpuFsaContextControl::~GpuFsaContextControl()
+{
+ // Load the OpenCL runtime without the tuned parameters to free the memory for them.
+ try
+ {
+ UnloadOpenClRuntime();
+ }
+ catch (const cl::Error& clError)
+ {
+ // This should not happen, it is ignored if it does.
+
+ // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an
+ // exception of type std::length_error.
+ // Using stderr instead in this context as there is no point in nesting try-catch blocks here.
+ std::cerr << "A CL error occurred unloading the runtime tuner parameters: "
+ << clError.what() << ". CL error code is: " << clError.err() << std::endl;
+ }
+}
+
+void GpuFsaContextControl::LoadOpenClRuntime()
+{
+ DoLoadOpenClRuntime(true);
+}
+
+void GpuFsaContextControl::UnloadOpenClRuntime()
+{
+ DoLoadOpenClRuntime(false);
+}
+
+void GpuFsaContextControl::DoLoadOpenClRuntime(bool updateTunedParameters)
+{
+ cl::Device device = cl::Device::getDefault();
+ cl::Context context;
+ cl::CommandQueue commandQueue;
+
+ if (arm_compute::CLScheduler::get().is_initialised() && arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // Wait for all queued CL requests to finish before reinitialising it.
+ arm_compute::CLScheduler::get().sync();
+ }
+
+ try
+ {
+ arm_compute::CLKernelLibrary::get().clear_programs_cache();
+ // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no
+ // context references); it is initialised again, with a proper context, later.
+ arm_compute::CLScheduler::get().init(context, commandQueue, device);
+ arm_compute::CLKernelLibrary::get().init(".", context, device);
+
+ {
+ //
+ // Here we replace the context with a new one. Because of their
+ // scope, the memory leak checks count the creation of this context
+ // as an extra allocation but not the disposal of the original
+ // object, so the new context would be flagged as a memory leak.
+ // The following line prevents that from happening.
+ //
+ ARMNN_DISABLE_LEAK_CHECKING_IN_SCOPE();
+ context = cl::Context(device);
+ }
+
+ // NOTE: In this specific case profiling has to be enabled on the command queue
+ // in order for the CLTuner to work.
+ bool profilingNeededForClTuner = updateTunedParameters && m_Tuner &&
+ m_Tuner->tune_new_kernels();
+
+ if (m_ProfilingEnabled || profilingNeededForClTuner)
+ {
+ // Create a new queue with profiling enabled.
+ commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ // Use default queue.
+ commandQueue = cl::CommandQueue(context, device);
+ }
+ }
+ catch (const cl::Error& clError)
+ {
+ throw ClRuntimeUnavailableException(fmt::format(
+ "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}",
+ clError.what(), clError.err()));
+ }
+
+ // Note: the first argument (the path to the CL source code) is ignored, as the kernels should be embedded in Arm Compute.
+ arm_compute::CLKernelLibrary::get().init(".", context, device);
+ arm_compute::CLScheduler::get().init(context, commandQueue, device, m_Tuner, m_HeuristicsHandle);
+}
+
+void GpuFsaContextControl::ClearClCache()
+{
+ DoLoadOpenClRuntime(true);
+}
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaContextControl.hpp b/src/backends/gpuFsa/GpuFsaContextControl.hpp
new file mode 100644
index 0000000000..f77b1fbdd4
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaContextControl.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <aclCommon/ArmComputeTuningUtils.hpp>
+
+namespace armnn
+{
+
+// ARM Compute OpenCL context control.
+class GpuFsaContextControl
+{
+public:
+
+ GpuFsaContextControl(arm_compute::CLTuner* = nullptr,
+ arm_compute::CLGEMMHeuristicsHandle* = nullptr,
+ bool profilingEnabled = false);
+
+ virtual ~GpuFsaContextControl();
+
+ void LoadOpenClRuntime();
+
+ // Users should call this (after freeing all of the cl::Context objects they use)
+ // to release the cached memory used by the compute library.
+ void UnloadOpenClRuntime();
+
+ // Clear the CL cache, without losing the tuned parameter settings.
+ void ClearClCache();
+
+private:
+
+ void DoLoadOpenClRuntime(bool updateTunedParameters);
+
+ arm_compute::CLTuner* m_Tuner;
+ arm_compute::CLGEMMHeuristicsHandle* m_HeuristicsHandle;
+
+ bool m_ProfilingEnabled;
+};
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.cpp b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp
new file mode 100644
index 0000000000..063af2732e
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaLayerSupport.cpp
@@ -0,0 +1,111 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaLayerSupport.hpp"
+
+#include <armnn/Types.hpp>
+#include <armnn/utility/IgnoreUnused.hpp>
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+#include "layerValidators/GpuFsaConvolution2dValidate.hpp"
+#endif
+
+#include <vector>
+
+namespace armnn
+{
+
+template<typename ... Args>
+bool IsGpuFsaBackendSupported(Optional<std::string&> reasonIfUnsupported, Args... args)
+{
+ IgnoreUnused(reasonIfUnsupported, (args)...);
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+ return true;
+#else
+ if (reasonIfUnsupported)
+ {
+ reasonIfUnsupported.value() = "The armnn library has been built without CL support";
+ }
+ return false;
+#endif
+}
+
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+#define FORWARD_GPUFSA_LAYER_SUPPORT_FUNC(expr) (expr)
+#else
+#define FORWARD_GPUFSA_LAYER_SUPPORT_FUNC(expr) IsGpuFsaBackendSupported(reasonIfUnsupported)
+#endif
+
+#if defined(ARMCOMPUTEGPUFSA_ENABLED)
+template<class FuncType, class... Args>
+inline bool CheckIsLayerSupported(FuncType&& func, Optional<std::string&> reasonIfUnsupported, Args&&... args)
+{
+ arm_compute::Status aclStatus = func(std::forward<Args>(args)...);
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ reasonIfUnsupported.value() = aclStatus.error_description();
+ }
+ return supported;
+}
+
+#define FORWARD_LAYER_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \
+ return CheckIsLayerSupported(func, reasonIfUnsupported, __VA_ARGS__);
+#else
+#define FORWARD_LAYER_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \
+ return IsGpuFsaBackendSupported(reasonIfUnsupported, __VA_ARGS__);
+#endif
+
+bool GpuFsaLayerSupport::IsLayerSupported(const LayerType& type,
+ const std::vector<TensorInfo>& infos,
+ const BaseDescriptor& descriptor,
+ const Optional<LstmInputParamsInfo>& lstmParamsInfo,
+ const Optional<QuantizedLstmInputParamsInfo>& quantizedLstmInputParamsInfo,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ IgnoreUnused(lstmParamsInfo);
+ IgnoreUnused(quantizedLstmInputParamsInfo);
+
+ switch (type) {
+ case LayerType::Convolution2d:
+ {
+ if (infos.size() != 4)
+ {
+ throw InvalidArgumentException("Invalid number of Convolution2d TensorInfos. "
+ "TensorInfos should be of format: {input, output, weights, biases}.");
+ }
+
+ auto desc = *(PolymorphicDowncast<const Convolution2dDescriptor*>(&descriptor));
+ if (infos[3] == TensorInfo())
+ {
+ FORWARD_LAYER_VALIDATE_FUNC(GpuFsaConvolution2dValidate,
+ reasonIfUnsupported,
+ infos[0],
+ desc,
+ infos[2],
+ EmptyOptional());
+ }
+ else
+ {
+ FORWARD_LAYER_VALIDATE_FUNC(GpuFsaConvolution2dValidate,
+ reasonIfUnsupported,
+ infos[0],
+ desc,
+ infos[2],
+ infos[3]);
+ }
+ }
+ case LayerType::Constant:
+ case LayerType::Input:
+ case LayerType::Output:
+ return IsGpuFsaBackendSupported(reasonIfUnsupported, infos[0]);
+ default:
+ // Layers not supported in the GpuFsa backend.
+ return false;
+ }
+}
+
+} // namespace armnn
\ No newline at end of file
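A hedged sketch of querying the new support checker directly; tensor shapes and data type are illustrative, and a default-constructed TensorInfo in the biases slot means "no bias", mirroring the check above:

    armnn::GpuFsaLayerSupport support;
    armnn::Convolution2dDescriptor convDesc;
    armnn::TensorInfo input({1, 16, 16, 3}, armnn::DataType::Float32);
    armnn::TensorInfo output({1, 14, 14, 8}, armnn::DataType::Float32);
    armnn::TensorInfo weights({8, 3, 3, 3}, armnn::DataType::Float32);
    std::string reason;
    bool supported = support.IsLayerSupported(
        armnn::LayerType::Convolution2d,
        {input, output, weights, armnn::TensorInfo()},  // {input, output, weights, biases}
        convDesc,
        armnn::EmptyOptional(),                         // no LSTM params
        armnn::EmptyOptional(),                         // no quantized LSTM params
        armnn::Optional<std::string&>(reason));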
diff --git a/src/backends/gpuFsa/GpuFsaLayerSupport.hpp b/src/backends/gpuFsa/GpuFsaLayerSupport.hpp
new file mode 100644
index 0000000000..31177ec3c9
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaLayerSupport.hpp
@@ -0,0 +1,24 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backendsCommon/LayerSupportBase.hpp>
+#include <backendsCommon/LayerSupportRules.hpp>
+
+namespace armnn
+{
+
+class GpuFsaLayerSupport : public ILayerSupport
+{
+public:
+ bool IsLayerSupported(const LayerType& type,
+ const std::vector<TensorInfo>& infos,
+ const BaseDescriptor& descriptor,
+ const Optional<LstmInputParamsInfo>& lstmParamsInfo,
+ const Optional<QuantizedLstmInputParamsInfo>&,
+ Optional<std::string&> reasonIfUnsupported) const override;
+};
+
+} // namespace armnn
\ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp
new file mode 100644
index 0000000000..e16c02d18e
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp
@@ -0,0 +1,120 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "GpuFsaMemoryManager.hpp"
+#include "Exceptions.hpp"
+
+#include <algorithm>
+
+namespace armnn
+{
+
+GpuFsaMemoryManager::GpuFsaMemoryManager()
+{}
+
+GpuFsaMemoryManager::~GpuFsaMemoryManager()
+{}
+
+GpuFsaMemoryManager::Pool* GpuFsaMemoryManager::Manage(unsigned int numBytes)
+{
+ if (!m_FreePools.empty())
+ {
+ Pool* res = m_FreePools.back();
+ m_FreePools.pop_back();
+ res->Reserve(numBytes);
+ return res;
+ }
+ else
+ {
+ m_Pools.push_front(Pool(numBytes));
+ return &m_Pools.front();
+ }
+}
+
+void GpuFsaMemoryManager::Allocate(GpuFsaMemoryManager::Pool* pool)
+{
+ if (pool == nullptr)
+ {
+ throw armnn::MemoryValidationException(
+ "GpuFsaMemoryManager: Allocate: Attempting to allocate a null memory pool ptr");
+ }
+ m_FreePools.push_back(pool);
+}
+
+void* GpuFsaMemoryManager::GetPointer(GpuFsaMemoryManager::Pool* pool)
+{
+ return pool->GetPointer();
+}
+
+void GpuFsaMemoryManager::Acquire()
+{
+ for (Pool &pool: m_Pools)
+ {
+ pool.Acquire();
+ }
+}
+
+void GpuFsaMemoryManager::Release()
+{
+ for (Pool &pool: m_Pools)
+ {
+ pool.Release();
+ }
+}
+
+GpuFsaMemoryManager::Pool::Pool(unsigned int numBytes)
+ : m_Size(numBytes),
+ m_Pointer(nullptr)
+{}
+
+GpuFsaMemoryManager::Pool::~Pool()
+{
+ if (m_Pointer)
+ {
+ Release();
+ }
+}
+
+void* GpuFsaMemoryManager::Pool::GetPointer()
+{
+ if (m_Pointer == nullptr)
+ {
+ throw armnn::MemoryValidationException(
+ "GpuFsaMemoryManager::Pool::GetPointer() called when memory not acquired");
+ }
+ return m_Pointer;
+}
+
+void GpuFsaMemoryManager::Pool::Reserve(unsigned int numBytes)
+{
+ if (m_Pointer != nullptr)
+ {
+ throw armnn::MemoryValidationException(
+ "GpuFsaMemoryManager::Pool::Reserve() cannot be called after memory acquired");
+ }
+ m_Size = std::max(m_Size, numBytes);
+}
+
+void GpuFsaMemoryManager::Pool::Acquire()
+{
+ if (m_Pointer != nullptr)
+ {
+ throw armnn::MemoryValidationException(
+ "GpuFsaMemoryManager::Pool::Acquire() called when memory already acquired");
+ }
+ m_Pointer = ::operator new(size_t(m_Size));
+}
+
+void GpuFsaMemoryManager::Pool::Release()
+{
+ if (m_Pointer == nullptr)
+ {
+ throw armnn::MemoryValidationException(
+ "GpuFsaMemoryManager::Pool::Release() called when memory not acquired");
+ }
+ ::operator delete(m_Pointer);
+ m_Pointer = nullptr;
+}
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp
new file mode 100644
index 0000000000..f68273a786
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/backends/IMemoryManager.hpp>
+
+#include <forward_list>
+#include <vector>
+
+namespace armnn
+{
+
+// A dummy MemoryManager which will be deleted once the GpuFsa Backend is integrated with ClMemoryManager
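+//
+// A minimal usage sketch of the intended two-phase lifecycle (given some
+// armnn::TensorInfo info):
+//
+//     GpuFsaMemoryManager mgr;
+//     auto* pool = mgr.Manage(info.GetNumBytes()); // start of the tensor's lifetime
+//     mgr.Allocate(pool);                          // end of lifetime: pool becomes reusable
+//     mgr.Acquire();                               // back all pools with real memory
+//     void* ptr = mgr.GetPointer(pool);            // valid between Acquire() and Release()
+//     mgr.Release();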
+class GpuFsaMemoryManager : public IMemoryManager
+{
+public:
+ GpuFsaMemoryManager();
+ virtual ~GpuFsaMemoryManager();
+
+ class Pool;
+
+ Pool* Manage(unsigned int numBytes);
+
+ void Allocate(Pool *pool);
+
+ void* GetPointer(Pool *pool);
+
+ void Acquire() override;
+ void Release() override;
+
+ class Pool
+ {
+ public:
+ Pool(unsigned int numBytes);
+ ~Pool();
+
+ void Acquire();
+ void Release();
+
+ void* GetPointer();
+
+ void Reserve(unsigned int numBytes);
+
+ private:
+ unsigned int m_Size;
+ void* m_Pointer;
+ };
+
+private:
+ GpuFsaMemoryManager(const GpuFsaMemoryManager&) = delete; // Noncopyable
+ GpuFsaMemoryManager& operator=(const GpuFsaMemoryManager&) = delete; // Noncopyable
+
+ std::forward_list<Pool> m_Pools;
+ std::vector<Pool*> m_FreePools;
+};
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp
new file mode 100644
index 0000000000..9efb300576
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaBackend.hpp"
+#include <armnn/BackendRegistry.hpp>
+
+namespace
+{
+using namespace armnn;
+static BackendRegistry::StaticRegistryInitializer g_RegisterHelper
+{
+ BackendRegistryInstance(),
+ GpuFsaBackend::GetIdStatic(),
+ []()
+ {
+ return IBackendInternalUniquePtr(new GpuFsaBackend);
+ }
+};
+} // Anonymous namespace \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp
new file mode 100644
index 0000000000..249b915ce1
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp
@@ -0,0 +1,188 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "GpuFsaTensorHandle.hpp"
+
+namespace armnn
+{
+GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo,
+ std::shared_ptr<GpuFsaMemoryManager>& memoryManager)
+ : m_TensorInfo(tensorInfo)
+ , m_MemoryManager(memoryManager)
+ , m_Pool(nullptr)
+ , m_UnmanagedMemory(nullptr)
+ , m_ImportFlags(static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ , m_Imported(false)
+ , m_IsImportEnabled(false)
+{}
+
+GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo,
+ MemorySourceFlags importFlags)
+ : m_TensorInfo(tensorInfo)
+ , m_Pool(nullptr)
+ , m_UnmanagedMemory(nullptr)
+ , m_ImportFlags(importFlags)
+ , m_Imported(false)
+ , m_IsImportEnabled(true)
+{}
+
+GpuFsaTensorHandle::~GpuFsaTensorHandle()
+{
+ if (!m_Pool)
+ {
+ // unmanaged
+ if (!m_Imported)
+ {
+ ::operator delete(m_UnmanagedMemory);
+ }
+ }
+}
+
+void GpuFsaTensorHandle::Manage()
+{
+ if (!m_IsImportEnabled)
+ {
+        if (m_Pool != nullptr)
+ {
+ throw MemoryValidationException("GpuFsaTensorHandle::Manage() called twice");
+ }
+        if (m_UnmanagedMemory != nullptr)
+ {
+ throw MemoryValidationException("GpuFsaTensorHandle::Manage() called after Allocate()");
+ }
+
+ m_Pool = m_MemoryManager->Manage(m_TensorInfo.GetNumBytes());
+ }
+}
+
+void GpuFsaTensorHandle::Allocate()
+{
+ // If import is enabled, do not allocate the tensor
+ if (!m_IsImportEnabled)
+ {
+
+ if (!m_UnmanagedMemory)
+ {
+ if (!m_Pool)
+ {
+ // unmanaged
+ m_UnmanagedMemory = ::operator new(m_TensorInfo.GetNumBytes());
+ }
+ else
+ {
+ m_MemoryManager->Allocate(m_Pool);
+ }
+ }
+ else
+ {
+            throw InvalidArgumentException("GpuFsaTensorHandle::Allocate Trying to allocate a GpuFsaTensorHandle "
+                                           "that already has allocated memory.");
+ }
+ }
+}
+
+const void* GpuFsaTensorHandle::Map(bool /*unused*/) const
+{
+ return GetPointer();
+}
+
+void* GpuFsaTensorHandle::GetPointer() const
+{
+ if (m_UnmanagedMemory)
+ {
+ return m_UnmanagedMemory;
+ }
+ else if (m_Pool)
+ {
+ return m_MemoryManager->GetPointer(m_Pool);
+ }
+ else
+ {
+ throw NullPointerException("GpuFsaTensorHandle::GetPointer called on unmanaged, unallocated tensor handle");
+ }
+}
+
+void GpuFsaTensorHandle::CopyOutTo(void* dest) const
+{
+ const void *src = GetPointer();
+ if (src == nullptr)
+ {
+ throw MemoryValidationException("GpuFsaTensorhandle: CopyOutTo: Invalid memory src pointer");
+ }
+ memcpy(dest, src, m_TensorInfo.GetNumBytes());
+}
+
+void GpuFsaTensorHandle::CopyInFrom(const void* src)
+{
+ void *dest = GetPointer();
+ if (dest == nullptr)
+ {
+ throw MemoryValidationException("GpuFsaTensorhandle: CopyInFrom: Invalid memory dest pointer");
+ }
+ memcpy(dest, src, m_TensorInfo.GetNumBytes());
+}
+
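+// Attempts to use externally allocated memory directly. Import succeeds only for
+// MemorySource::Malloc with a suitably aligned pointer; it fails if the handle was
+// already backed by Allocate(), while a previously imported handle may be re-pointed.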
+bool GpuFsaTensorHandle::Import(void* memory, MemorySource source)
+{
+ if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
+ {
+ if (m_IsImportEnabled && source == MemorySource::Malloc)
+ {
+ // Check memory alignment
+ if(!CanBeImported(memory, source))
+ {
+ if (m_Imported)
+ {
+ m_Imported = false;
+ m_UnmanagedMemory = nullptr;
+ }
+ return false;
+ }
+
+ // m_UnmanagedMemory not yet allocated.
+ if (!m_Imported && !m_UnmanagedMemory)
+ {
+ m_UnmanagedMemory = memory;
+ m_Imported = true;
+ return true;
+ }
+
+ // m_UnmanagedMemory initially allocated with Allocate().
+ if (!m_Imported && m_UnmanagedMemory)
+ {
+ return false;
+ }
+
+ // m_UnmanagedMemory previously imported.
+ if (m_Imported)
+ {
+ m_UnmanagedMemory = memory;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool GpuFsaTensorHandle::CanBeImported(void* memory, MemorySource source)
+{
+ if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
+ {
+ if (m_IsImportEnabled && source == MemorySource::Malloc)
+ {
+ uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType());
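+            // The imported pointer must be aligned to the element size, e.g. 4 bytes for Float32.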
+ if (reinterpret_cast<uintptr_t>(memory) % alignment)
+ {
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp
new file mode 100644
index 0000000000..d6901d1225
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp
@@ -0,0 +1,361 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <aclCommon/ArmComputeTensorHandle.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <armnn/utility/PolymorphicDowncast.hpp>
+#include <Half.hpp>
+
+#include <cassert>
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/CLSubTensor.h>
+#include <arm_compute/runtime/IMemoryGroup.h>
+#include <arm_compute/runtime/MemoryGroup.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Coordinates.h>
+
+#include <aclCommon/IClTensorHandle.hpp>
+
+namespace armnn
+{
+
+class GpuFsaTensorHandle : public IClTensorHandle
+{
+public:
+ GpuFsaTensorHandle(const TensorInfo& tensorInfo)
+ : m_ImportFlags(static_cast<MemorySourceFlags>(MemorySource::Undefined)),
+ m_Imported(false),
+ m_IsImportEnabled(false)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo);
+ }
+
+ GpuFsaTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout,
+ MemorySourceFlags importFlags = static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ : m_ImportFlags(importFlags),
+ m_Imported(false),
+ m_IsImportEnabled(false)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout);
+ }
+
+ arm_compute::CLTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; }
+ virtual void Allocate() override
+ {
+ // If we have enabled Importing, don't allocate the tensor
+ if (m_IsImportEnabled)
+ {
+ throw MemoryImportException("GpuFsaTensorHandle::Attempting to allocate memory when importing");
+ }
+ else
+ {
+ armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);
+ }
+ }
+
+ virtual void Manage() override
+ {
+ // If we have enabled Importing, don't manage the tensor
+ if (m_IsImportEnabled)
+ {
+ throw MemoryImportException("GpuFsaTensorHandle::Attempting to manage memory when importing");
+ }
+ else
+ {
+ assert(m_MemoryGroup != nullptr);
+ m_MemoryGroup->manage(&m_Tensor);
+ }
+ }
+
+ virtual const void* Map(bool blocking = true) const override
+ {
+ const_cast<arm_compute::CLTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+
+ virtual void Unmap() const override { const_cast<arm_compute::CLTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override
+ {
+ m_MemoryGroup = PolymorphicPointerDowncast<arm_compute::MemoryGroup>(memoryGroup);
+ }
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
+ void SetImportFlags(MemorySourceFlags importFlags)
+ {
+ m_ImportFlags = importFlags;
+ }
+
+ MemorySourceFlags GetImportFlags() const override
+ {
+ return m_ImportFlags;
+ }
+
+ void SetImportEnabledFlag(bool importEnabledFlag)
+ {
+ m_IsImportEnabled = importEnabledFlag;
+ }
+
+ virtual bool Import(void* /*memory*/, MemorySource source) override
+ {
+ if (m_ImportFlags & static_cast<MemorySourceFlags>(source))
+ {
+ throw MemoryImportException("GpuFsaTensorHandle::Incorrect import flag");
+ }
+ m_Imported = false;
+ return false;
+ }
+
+ virtual bool CanBeImported(void* /*memory*/, MemorySource /*source*/) override
+ {
+ // This TensorHandle can never import.
+ return false;
+ }
+
+private:
+ // Only used for testing
+ void CopyOutTo(void* memory) const override
+ {
+ const_cast<armnn::GpuFsaTensorHandle*>(this)->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<float*>(memory));
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<uint8_t*>(memory));
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int8_t*>(memory));
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::Half*>(memory));
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int16_t*>(memory));
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int32_t*>(memory));
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ const_cast<armnn::GpuFsaTensorHandle*>(this)->Unmap();
+ }
+
+ // Only used for testing
+ void CopyInFrom(const void* memory) override
+ {
+ this->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+ this->GetTensor());
+ break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+ this->GetTensor());
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ this->Unmap();
+ }
+
+ arm_compute::CLTensor m_Tensor;
+ std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup;
+ MemorySourceFlags m_ImportFlags;
+ bool m_Imported;
+ bool m_IsImportEnabled;
+};
+
+class GpuFsaSubTensorHandle : public IClTensorHandle
+{
+public:
+ GpuFsaSubTensorHandle(IClTensorHandle* parent,
+ const arm_compute::TensorShape& shape,
+ const arm_compute::Coordinates& coords)
+ : m_Tensor(&parent->GetTensor(), shape, coords)
+ {
+ parentHandle = parent;
+ }
+
+ arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; }
+
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
+
+ virtual const void* Map(bool blocking = true) const override
+ {
+ const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override { const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle* GetParent() const override { return parentHandle; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
+private:
+ // Only used for testing
+ void CopyOutTo(void* memory) const override
+ {
+ const_cast<GpuFsaSubTensorHandle*>(this)->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<float*>(memory));
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<uint8_t*>(memory));
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<armnn::Half*>(memory));
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int8_t*>(memory));
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int16_t*>(memory));
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+ static_cast<int32_t*>(memory));
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ const_cast<GpuFsaSubTensorHandle*>(this)->Unmap();
+ }
+
+ // Only used for testing
+ void CopyInFrom(const void* memory) override
+ {
+ this->Map(true);
+ switch(this->GetDataType())
+ {
+ case arm_compute::DataType::F32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::U8:
+ case arm_compute::DataType::QASYMM8:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::F16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::QSYMM8:
+ case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+ case arm_compute::DataType::QASYMM8_SIGNED:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S16:
+ case arm_compute::DataType::QSYMM16:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+ this->GetTensor());
+ break;
+ case arm_compute::DataType::S32:
+ armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+ this->GetTensor());
+ break;
+ default:
+ {
+ throw armnn::UnimplementedException();
+ }
+ }
+ this->Unmap();
+ }
+
+ mutable arm_compute::CLSubTensor m_Tensor;
+ ITensorHandle* parentHandle = nullptr;
+};
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
new file mode 100644
index 0000000000..c1a34d24e5
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
@@ -0,0 +1,112 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaTensorHandle.hpp"
+#include "GpuFsaTensorHandleFactory.hpp"
+
+namespace armnn
+{
+
+using FactoryId = ITensorHandleFactory::FactoryId;
+
+std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateSubTensorHandle(ITensorHandle& parent,
+ const TensorShape& subTensorShape,
+ const unsigned int* subTensorOrigin) const
+{
+ arm_compute::Coordinates coords;
+ arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
+
+ coords.set_num_dimensions(subTensorShape.GetNumDimensions());
+ for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); ++i)
+ {
+ // Arm compute indexes tensor coords in reverse order.
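+        // e.g. an armnn origin of {a, b, c, d} maps to ACL coordinates {d, c, b, a}.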
+ unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
+ coords.set(i, armnn::numeric_cast<int>(subTensorOrigin[revertedIndex]));
+ }
+
+ const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.GetShape());
+
+    // For ACL to support sub-tensors the concat axis cannot be on x or y, and the x and y
+    // values must match those of the parent shape.
+ if (coords.x() != 0 || coords.y() != 0)
+ {
+ return nullptr;
+ }
+ if ((parentShape.x() != shape.x()) || (parentShape.y() != shape.y()))
+ {
+ return nullptr;
+ }
+
+ if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape))
+ {
+ return nullptr;
+ }
+
+ return std::make_unique<GpuFsaSubTensorHandle>(PolymorphicDowncast<IClTensorHandle*>(&parent), shape, coords);
+}
+
+std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
+{
+ return GpuFsaTensorHandleFactory::CreateTensorHandle(tensorInfo, true);
+}
+
+std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const
+{
+ return GpuFsaTensorHandleFactory::CreateTensorHandle(tensorInfo, dataLayout, true);
+}
+
+std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ const bool IsMemoryManaged) const
+{
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
+ if (!IsMemoryManaged)
+ {
+        ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only supports memory managed tensor handles.";
+ }
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+ return tensorHandle;
+}
+
+std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout,
+ const bool IsMemoryManaged) const
+{
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
+ if (!IsMemoryManaged)
+ {
+        ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only supports memory managed tensor handles.";
+ }
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+ return tensorHandle;
+}
+
+const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
+{
+ static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
+ return s_Id;
+}
+
+const FactoryId& GpuFsaTensorHandleFactory::GetId() const
+{
+ return GetIdStatic();
+}
+
+bool GpuFsaTensorHandleFactory::SupportsSubTensors() const
+{
+ return true;
+}
+
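+// Import and export of external memory are not supported yet, so both flag getters
+// report MemorySource::Undefined.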
+MemorySourceFlags GpuFsaTensorHandleFactory::GetExportFlags() const
+{
+ return MemorySourceFlags(MemorySource::Undefined);
+}
+
+MemorySourceFlags GpuFsaTensorHandleFactory::GetImportFlags() const
+{
+ return MemorySourceFlags(MemorySource::Undefined);
+}
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
new file mode 100644
index 0000000000..93a44259f6
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
@@ -0,0 +1,55 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/backends/ITensorHandleFactory.hpp>
+
+#include <aclCommon/BaseMemoryManager.hpp>
+
+namespace armnn
+{
+
+constexpr const char * GpuFsaTensorHandleFactoryId() { return "Arm/GpuFsa/TensorHandleFactory"; }
+
+class GpuFsaTensorHandleFactory : public ITensorHandleFactory
+{
+public:
+ GpuFsaTensorHandleFactory(std::shared_ptr<GpuFsaMemoryManager> mgr)
+ : m_MemoryManager(mgr)
+ {}
+
+ std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const override;
+
+ std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo) const override;
+
+ std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const override;
+
+ std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ const bool IsMemoryManaged) const override;
+
+ std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout,
+ const bool IsMemoryManaged) const override;
+
+ static const FactoryId& GetIdStatic();
+
+ const FactoryId& GetId() const override;
+
+ bool SupportsSubTensors() const override;
+
+ MemorySourceFlags GetExportFlags() const override;
+
+ MemorySourceFlags GetImportFlags() const override;
+
+private:
+ mutable std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager;
+};
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
new file mode 100644
index 0000000000..6d13879f51
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
@@ -0,0 +1,91 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <Layer.hpp>
+
+#include "GpuFsaWorkloadFactory.hpp"
+#include "GpuFsaBackendId.hpp"
+#include "GpuFsaTensorHandle.hpp"
+
+namespace armnn
+{
+
+namespace
+{
+static const BackendId s_Id{GpuFsaBackendId()};
+} // anonymous namespace
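+
+// Skeleton implementation: no GpuFsa workloads exist yet, so workload creation
+// always returns nullptr.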
+template <typename QueueDescriptorType>
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& /*descriptor*/,
+ const WorkloadInfo& /*info*/) const
+{
+ return nullptr;
+}
+
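+// Returns true if any input or output tensor in the WorkloadInfo has the requested data
+// type, e.g. IsDataType<DataType::Float32>(info).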
+template <DataType ArmnnType>
+bool IsDataType(const WorkloadInfo& info)
+{
+ auto checkType = [](const TensorInfo& tensorInfo) {return tensorInfo.GetDataType() == ArmnnType;};
+ auto it = std::find_if(std::begin(info.m_InputTensorInfos), std::end(info.m_InputTensorInfos), checkType);
+ if (it != std::end(info.m_InputTensorInfos))
+ {
+ return true;
+ }
+ it = std::find_if(std::begin(info.m_OutputTensorInfos), std::end(info.m_OutputTensorInfos), checkType);
+ if (it != std::end(info.m_OutputTensorInfos))
+ {
+ return true;
+ }
+ return false;
+}
+
+GpuFsaWorkloadFactory::GpuFsaWorkloadFactory(const std::shared_ptr<GpuFsaMemoryManager>& memoryManager)
+ : m_MemoryManager(memoryManager)
+{
+}
+
+GpuFsaWorkloadFactory::GpuFsaWorkloadFactory()
+ : m_MemoryManager(new GpuFsaMemoryManager())
+{
+}
+
+const BackendId& GpuFsaWorkloadFactory::GetBackendId() const
+{
+ return s_Id;
+}
+
+bool GpuFsaWorkloadFactory::IsLayerSupported(const Layer& layer,
+ Optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
+{
+ return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported);
+}
+
+std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ const bool /*isMemoryManaged*/) const
+{
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+
+ return tensorHandle;
+}
+
+std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout,
+ const bool /*isMemoryManaged*/) const
+{
+ std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+
+ return tensorHandle;
+}
+
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/,
+ const QueueDescriptor& /*descriptor*/,
+ const WorkloadInfo& /*info*/) const
+{
+ return nullptr;
+}
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
new file mode 100644
index 0000000000..9b97070766
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <aclCommon/BaseMemoryManager.hpp>
+
+#include <armnn/Optional.hpp>
+#include <armnn/backends/WorkloadFactory.hpp>
+
+namespace armnn
+{
+
+// Dynamic Fusion workload factory.
+class GpuFsaWorkloadFactory : public IWorkloadFactory
+{
+public:
+ explicit GpuFsaWorkloadFactory(const std::shared_ptr<GpuFsaMemoryManager>& memoryManager);
+ GpuFsaWorkloadFactory();
+
+ ~GpuFsaWorkloadFactory() {}
+
+ const BackendId& GetBackendId() const override;
+
+ static bool IsLayerSupported(const Layer& layer,
+ Optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
+
+ bool SupportsSubTensors() const override { return false; }
+
+ ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateSubTensorHandle instead")
+ std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& /*parent*/,
+ TensorShape const& /*subTensorShape*/,
+ unsigned int const* /*subTensorOrigin*/) const override
+ {
+ return nullptr;
+ }
+
+ ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateTensorHandle instead")
+ std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ const bool IsMemoryManaged = true) const override;
+
+ ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateTensorHandle instead")
+ std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout,
+ const bool IsMemoryManaged = true) const override;
+
+ std::unique_ptr<IWorkload> CreateWorkload(LayerType type,
+ const QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+private:
+ template <typename QueueDescriptorType>
+ std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const;
+
+ mutable std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager;
+};
+
+} // namespace armnn
diff --git a/src/backends/gpuFsa/backend.cmake b/src/backends/gpuFsa/backend.cmake
new file mode 100644
index 0000000000..16473336e0
--- /dev/null
+++ b/src/backends/gpuFsa/backend.cmake
@@ -0,0 +1,15 @@
+#
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/gpuFsa)
+list(APPEND armnnLibraries armnnGpuFsaBackend)
+
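+# ARMCOMPUTEGPUFSA is expected to be defined by the top-level CMake configuration (the
+# global config is the usual place for backend switches); when enabled, the validator,
+# workload and unit test libraries below are added to the build.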
+if(ARMCOMPUTEGPUFSA)
+ list(APPEND armnnLibraries armnnGpuFsaBackendLayerValidators)
+ list(APPEND armnnLibraries armnnGpuFsaBackendWorkloads)
+ list(APPEND armnnUnitTestLibraries armnnGpuFsaBackendUnitTests)
+else()
+ message(STATUS "GPU Dynamic Fusion backend is disabled")
+endif()
diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk
new file mode 100644
index 0000000000..d8d254205b
--- /dev/null
+++ b/src/backends/gpuFsa/backend.mk
@@ -0,0 +1,58 @@
+#
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+# BACKEND_SOURCES contains the list of files to be included
+# in the Android build and it is picked up by the Android.mk
+# file in the root of ArmNN
+
+# The variable to enable/disable the GPU Dynamic Fusion backend
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
+
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
+# Include the source files for the GPU Dynamic Fusion backend
+
+BACKEND_SOURCES := \
+ GpuFsaBackend.cpp \
+ GpuFsaBackendContext.cpp \
+ GpuFsaContextControl.cpp \
+ GpuFsaLayerSupport.cpp \
+ GpuFsaRegistryInitializer.cpp \
+ GpuFsaTensorHandleFactory.cpp \
+ GpuFsaWorkloadFactory.cpp \
+ layerValidators/GpuFsaConvolution2dValidate.cpp
+else
+
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
+# No source file will be compiled for the GPU Dynamic Fusion backend
+
+BACKEND_SOURCES :=
+
+endif
+
+# BACKEND_TEST_SOURCES contains the list of files to be included
+# in the Android unit test build (armnn-tests) and it is picked
+# up by the Android.mk file in the root of ArmNN
+
+# The variable to enable/disable the GPU Dynamic Fusion backend
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
+
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
+# Include the source files for the GPU Dynamic Fusion backend tests
+
+BACKEND_TEST_SOURCES := \
+ test/GpuFsaEndToEndTests.cpp \
+ test/GpuFsaLayerSupportTests.cpp \
+ test/GpuFsaLayerTests.cpp \
+ test/GpuFsaOptimizedNetworkTests.cpp
+else
+
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
+# No source file will be compiled for the GPU Dynamic Fusion backend tests
+
+BACKEND_TEST_SOURCES :=
+
+endif
diff --git a/src/backends/gpuFsa/layerValidators/CMakeLists.txt b/src/backends/gpuFsa/layerValidators/CMakeLists.txt
new file mode 100644
index 0000000000..57ea41d56c
--- /dev/null
+++ b/src/backends/gpuFsa/layerValidators/CMakeLists.txt
@@ -0,0 +1,14 @@
+#
+# Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnGpuFsaBackendLayerValidators_sources
+ GpuFsaConvolution2dValidate.cpp
+ GpuFsaConvolution2dValidate.hpp
+ )
+
+add_library(armnnGpuFsaBackendLayerValidators OBJECT ${armnnGpuFsaBackendLayerValidators_sources})
+target_include_directories(armnnGpuFsaBackendLayerValidators PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnGpuFsaBackendLayerValidators PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
+target_include_directories(armnnGpuFsaBackendLayerValidators PRIVATE ${PROJECT_SOURCE_DIR}/src/backends)
diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp
new file mode 100644
index 0000000000..bed7b26f74
--- /dev/null
+++ b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.cpp
@@ -0,0 +1,126 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaConvolution2dValidate.hpp"
+
+#include <armnn/Types.hpp>
+#include <armnn/utility/IgnoreUnused.hpp>
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/core/ITensorInfo.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/core/CL/CLCompileContext.h>
+
+#include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
+
+#include <vector>
+#include <iostream>
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
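+// Shared helper behind the Validate/CreateOp entry points below: it always runs
+// GpuConv2d::validate_op against a scratch sketch and, when createOp is true, also fuses
+// the convolution (plus a temporary GpuOutput) into that sketch.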
+inline arm_compute::Status ValidateAndCreateOp(const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases,
+ const bool createOp = false)
+{
+ // Create a new workload sketch, for validation purposes
+ auto compileCtx = arm_compute::CLKernelLibrary::get().get_compile_context();
+ auto gpuCtx = GpuWorkloadContext(&compileCtx);
+ GpuWorkloadSketch sketch{ &gpuCtx };
+
+ // Build and create tensor infos using the sketch
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+ aclWeightsInfo.set_are_values_constant(weights.IsConstant());
+
+ auto inputInfo = gpuCtx.create_tensor_info(aclInputInfo);
+ auto weightInfo = gpuCtx.create_tensor_info(aclWeightsInfo);
+
+ // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op
+ arm_compute::TensorInfo aclBiasInfo;
+ arm_compute::TensorInfo biasSketchInfo;
+ arm_compute::TensorInfo* biasSketchInfoPtr = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ if(!biases.has_value())
+ {
+ throw InvalidArgumentException("GpuFsaConvolution2dValidate: No biases set when biases are enabled");
+ }
+ aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout);
+ aclBiasInfo.set_are_values_constant(biases.value().IsConstant());
+
+ biasSketchInfo = gpuCtx.create_tensor_info(aclBiasInfo);
+ biasSketchInfoPtr = &biasSketchInfo;
+ }
+
+ // Set Conv2d attributes using descriptor
+ const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D(descriptor.m_DilationX,
+ descriptor.m_DilationY);
+ const arm_compute::Padding2D aclPadInfo = BuildArmComputePaddingInfo(descriptor);
+ const arm_compute::Size2D aclStrideInfo = BuildArmComputeSize2D(descriptor.m_StrideX, descriptor.m_StrideY);
+
+ Conv2dAttributes conv2DAttributes{};
+ conv2DAttributes.dilation(aclDilationInfo);
+ conv2DAttributes.pad(aclPadInfo);
+ conv2DAttributes.stride(aclStrideInfo);
+
+ // Validate operator, check status and update reasonIfUnsupported
+ arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch,
+ &inputInfo,
+ &weightInfo,
+ biasSketchInfoPtr,
+ conv2DAttributes);
+
+ if (createOp)
+ {
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported)
+ {
+ throw BackendCapabilityException("\"GpuFsa\" backend failed during operation validation when attempting "
+ "to fuse a GpuConv2d operator into the existing workload sketch.");
+ }
+
+ arm_compute::ITensorInfo* convOutInfo = GpuConv2d::create_op(sketch,
+ &inputInfo,
+ &weightInfo,
+ biasSketchInfoPtr,
+ conv2DAttributes);
+
+        // Temporary fix until a fusing attempt is made for the GpuFsa backend and an Output layer workload is created.
+ auto outputInfo = gpuCtx.create_tensor_info();
+ GpuOutput::create_op(sketch, convOutInfo, &outputInfo);
+ }
+
+ return aclStatus;
+}
+
+arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases)
+{
+ return ValidateAndCreateOp(input, descriptor, weights, biases);
+}
+
+void GpuFsaConvolution2dCreateOp(const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases)
+{
+ ValidateAndCreateOp(input, descriptor, weights, biases, true);
+}
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp
new file mode 100644
index 0000000000..120060e8ad
--- /dev/null
+++ b/src/backends/gpuFsa/layerValidators/GpuFsaConvolution2dValidate.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/Descriptors.hpp>
+#include <armnn/Tensor.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
+
+namespace armnn
+{
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
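+// GpuFsaConvolution2dValidate reports whether the convolution can be fused by the GpuFsa
+// backend; GpuFsaConvolution2dCreateOp additionally fuses it into a workload sketch and
+// throws if validation fails.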
+arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases);
+
+void GpuFsaConvolution2dCreateOp(const TensorInfo& input,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const Optional<TensorInfo>& biases);
+
+} // namespace armnn \ No newline at end of file
diff --git a/src/backends/gpuFsa/test/CMakeLists.txt b/src/backends/gpuFsa/test/CMakeLists.txt
new file mode 100644
index 0000000000..66091e90df
--- /dev/null
+++ b/src/backends/gpuFsa/test/CMakeLists.txt
@@ -0,0 +1,19 @@
+#
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnGpuFsaBackendUnitTests_sources
+ GpuFsaDefaultAllocatorTests.cpp
+ GpuFsaEndToEndTests.cpp
+ GpuFsaLayerTests.cpp
+ GpuFsaLayerSupportTests.cpp
+ GpuFsaOptimizedNetworkTests.cpp
+)
+
+add_library(armnnGpuFsaBackendUnitTests OBJECT ${armnnGpuFsaBackendUnitTests_sources})
+target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
+target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnTestUtils)
+target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/backends)
+target_include_directories(armnnGpuFsaBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/third-party/doctest)
diff --git a/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
new file mode 100644
index 0000000000..17d5952217
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
@@ -0,0 +1,193 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/backends/ICustomAllocator.hpp>
+#include <armnn/BackendRegistry.hpp>
+#include <armnn/Descriptors.hpp>
+#include <armnn/Exceptions.hpp>
+#include <armnn/IRuntime.hpp>
+#include <armnn/backends/TensorHandle.hpp>
+// Requires the OpenCL-based GpuFsa backend to be included
+#include <gpuFsa/GpuFsaBackend.hpp>
+#include <doctest/doctest.h>
+#include <backendsCommon/DefaultAllocator.hpp>
+#include <armnnTestUtils/MockBackend.hpp>
+#include <gpuFsa/GpuFsaBackendDefaultAllocator.hpp>
+
+using namespace armnn;
+
+namespace
+{
+
+TEST_SUITE("DefaultAllocatorTests")
+{
+
+TEST_CASE("DefaultAllocatorTest")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<DefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMulti")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<DefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+ void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto* inputPtr2 = reinterpret_cast<float*>(alignedInputPtr2);
+ std::fill_n(inputPtr2, numElements, number);
+ CHECK(inputPtr2[0] == 3);
+ CHECK(inputPtr2[1] == 3);
+
+ // No overlap
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMock")
+{
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Initialize Mock Backend
+ MockBackendInitialiser initialiser;
+ auto factoryFun = BackendRegistryInstance().GetFactory(MockBackend().GetIdStatic());
+ CHECK(factoryFun != nullptr);
+ auto backend = factoryFun();
+ auto defaultAllocator = backend->GetDefaultAllocator();
+
+ // GetMemorySourceType
+ CHECK(defaultAllocator->GetMemorySourceType() == MemorySource::Malloc);
+
+ size_t totalBytes = 1 * sizeof(float);
+ // Allocate
+ void* ptr = defaultAllocator->allocate(totalBytes, 0);
+
+ // GetMemoryRegionAtOffset
+ CHECK(defaultAllocator->GetMemoryRegionAtOffset(ptr, 0, 0));
+
+ // Free
+ defaultAllocator->free(ptr);
+
+ // Clean up
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.Deregister(MockBackend().GetIdStatic());
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+}
+
+
+TEST_SUITE("GpuFsaDefaultAllocatorTests")
+{
+
+TEST_CASE("GpuFsaDefaultAllocatorTest")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<GpuFsaBackendDefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("GpuFsaDefaultAllocatorTestMulti")
+{
+ float number = 3;
+
+ TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32);
+
+ // Create ArmNN runtime
+ IRuntime::CreationOptions options; // default options
+ auto customAllocator = std::make_shared<GpuFsaBackendDefaultAllocator>();
+ options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+ IRuntimePtr run = IRuntime::Create(options);
+
+ // Creates structures for input & output
+ unsigned int numElements = inputTensorInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+ void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ std::fill_n(inputPtr, numElements, number);
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto* inputPtr2 = reinterpret_cast<float*>(alignedInputPtr2);
+ std::fill_n(inputPtr2, numElements, number);
+ CHECK(inputPtr2[0] == 3);
+ CHECK(inputPtr2[1] == 3);
+
+ // No overlap
+ CHECK(inputPtr[0] == 3);
+ CHECK(inputPtr[1] == 3);
+
+ auto& backendRegistry = armnn::BackendRegistryInstance();
+ backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+}
+
+} // anonymous namespace
diff --git a/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp
new file mode 100644
index 0000000000..1d6b99a31f
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaEndToEndTests.cpp
@@ -0,0 +1,8 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "backendsCommon/test/EndToEndTestImpl.hpp"
+
+#include <doctest/doctest.h> \ No newline at end of file
diff --git a/src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp b/src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp
new file mode 100644
index 0000000000..f162df0b55
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaLayerSupportTests.cpp
@@ -0,0 +1,64 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/Optional.hpp>
+#include <armnn/Types.hpp>
+
+#include <gpuFsa/GpuFsaLayerSupport.hpp>
+
+#include <doctest/doctest.h>
+
+#include <iostream>
+
+using namespace armnn;
+
+TEST_SUITE("GpuFsaLayerSupport")
+{
+
+TEST_CASE("IsLayerSupportedGpuFsaConv2d")
+{
+ TensorInfo inputInfo ({ 1, 5, 5, 1 }, DataType::Float32);
+ TensorInfo outputInfo({ 1, 3, 3, 1 }, DataType::Float32);
+ TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true);
+ TensorInfo biasesInfo ({ 1 }, DataType::Float32, 0.0f, 0, true);
+
+ Convolution2dDescriptor desc;
+ desc.m_BiasEnabled = true;
+ desc.m_DataLayout = DataLayout::NHWC;
+
+ GpuFsaLayerSupport supportChecker;
+ std::string reasonIfNotSupported;
+ auto supported = supportChecker.IsLayerSupported(LayerType::Convolution2d,
+ {inputInfo, outputInfo, weightsInfo, biasesInfo},
+ desc,
+ EmptyOptional(),
+ EmptyOptional(),
+ reasonIfNotSupported);
+ CHECK(supported);
+}
+
+TEST_CASE("IsLayerSupportedGpuFsaConv2dUnsupported")
+{
+ TensorInfo inputInfo ({ 1, 5, 5, 1 }, DataType::Float32);
+ TensorInfo outputInfo({ 1, 3, 3, 1 }, DataType::Float32);
+ TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true);
+
+ // NCHW is unsupported.
+ Convolution2dDescriptor desc;
+ desc.m_DataLayout = DataLayout::NCHW;
+
+ GpuFsaLayerSupport supportChecker;
+ std::string reasonIfNotSupported;
+ auto supported = supportChecker.IsLayerSupported(LayerType::Convolution2d,
+ {inputInfo, outputInfo, weightsInfo, TensorInfo()},
+ desc,
+ EmptyOptional(),
+ EmptyOptional(),
+ reasonIfNotSupported);
+ CHECK(!supported);
+ REQUIRE(reasonIfNotSupported.find("NCHW not supported by this kernel") != std::string::npos);
+}
+
+} \ No newline at end of file
diff --git a/src/backends/gpuFsa/test/GpuFsaLayerTests.cpp b/src/backends/gpuFsa/test/GpuFsaLayerTests.cpp
new file mode 100644
index 0000000000..e032922d17
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaLayerTests.cpp
@@ -0,0 +1,12 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GpuFsaWorkloadFactoryHelper.hpp"
+
+#include <backendsCommon/test/LayerTests.hpp>
+
+#include <gpuFsa/GpuFsaWorkloadFactory.hpp>
+
+#include <UnitTests.hpp> \ No newline at end of file
diff --git a/src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp b/src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp
new file mode 100644
index 0000000000..7e094cec1e
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaOptimizedNetworkTests.cpp
@@ -0,0 +1,137 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/INetwork.hpp>
+
+#include <GraphUtils.hpp>
+#include <TestUtils.hpp>
+
+#include <doctest/doctest.h>
+
+using namespace armnn;
+
+TEST_SUITE("GpuFsaOptimizedNetwork")
+{
+
+TEST_CASE("SingleConv2dSupportedOptimizedNetwork")
+{
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+ INetworkPtr network(INetwork::Create());
+
+ TensorInfo inputInfo({ 1, 5, 5, 1 }, DataType::Float32);
+ TensorInfo outputInfo({ 1, 3, 3, 1 }, DataType::Float32);
+ TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true);
+ TensorInfo biasesInfo({ 1 }, DataType::Float32, 0.0f, 0, true);
+
+ Convolution2dDescriptor desc;
+ desc.m_BiasEnabled = true;
+ desc.m_DataLayout = DataLayout::NHWC;
+
+ auto inputLayer = network->AddInputLayer(0, "input");
+ auto weightLayer = network->AddConstantLayer(ConstTensor(weightsInfo, nullptr), "weights");
+ auto biasLayer = network->AddConstantLayer(ConstTensor(biasesInfo, nullptr), "bias");
+ auto convLayer = network->AddConvolution2dLayer(desc, "conv2d");
+ auto outputLayer = network->AddOutputLayer(1, "output");
+
+ inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ weightLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1));
+ weightLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);
+
+ biasLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(2));
+ biasLayer->GetOutputSlot(0).SetTensorInfo(biasesInfo);
+
+ convLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+ convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+ std::vector<BackendId> backends = { "GpuFsa" };
+
+ OptimizerOptionsOpaque optimizedOptions;
+ IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optimizedOptions);
+ CHECK(optNet);
+
+ Graph& graph = GetGraphForTesting(optNet.get());
+
+ // Check graph layer sequence to ensure that the network has been replaced with a PreCompiledLayer
+ CHECK(CheckSequence(graph.cbegin(), graph.cend(),
+ &IsLayerOfType<InputLayer>,
+ &IsLayerOfType<ConstantLayer>,
+ &IsLayerOfType<ConstantLayer>,
+ &IsLayerOfType<PreCompiledLayer>,
+ &IsLayerOfType<OutputLayer>));
+}
+
+TEST_CASE("TwoConv2dSupportedOptimizedNetwork")
+{
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+ INetworkPtr network(INetwork::Create());
+
+ TensorInfo inputInfo({ 1, 5, 5, 1 }, DataType::Float32);
+ TensorInfo intermediateInfo({ 1, 3, 3, 1 }, DataType::Float32);
+ TensorInfo outputInfo({ 1, 1, 1, 1 }, DataType::Float32);
+ TensorInfo weightsInfo({ 1, 3, 3, 1 }, DataType::Float32, 0.0f, 0, true);
+ TensorInfo biasesInfo({ 1 }, DataType::Float32, 0.0f, 0, true);
+
+ Convolution2dDescriptor desc;
+ desc.m_BiasEnabled = true;
+ desc.m_DataLayout = DataLayout::NHWC;
+
+ auto inputLayer = network->AddInputLayer(0, "input");
+
+ auto weightLayer1 = network->AddConstantLayer(ConstTensor(weightsInfo, nullptr), "weights");
+ auto biasLayer1 = network->AddConstantLayer(ConstTensor(biasesInfo, nullptr), "bias");
+ auto convLayer1 = network->AddConvolution2dLayer(desc, "conv2d");
+
+ auto weightLayer2 = network->AddConstantLayer(ConstTensor(weightsInfo, nullptr), "weights");
+ auto biasLayer2 = network->AddConstantLayer(ConstTensor(biasesInfo, nullptr), "bias");
+ auto convLayer2 = network->AddConvolution2dLayer(desc, "conv2d");
+
+ auto outputLayer = network->AddOutputLayer(0, "output");
+
+ inputLayer->GetOutputSlot(0).Connect(convLayer1->GetInputSlot(0));
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ weightLayer1->GetOutputSlot(0).Connect(convLayer1->GetInputSlot(1));
+ weightLayer1->GetOutputSlot(0).SetTensorInfo(weightsInfo);
+
+ biasLayer1->GetOutputSlot(0).Connect(convLayer1->GetInputSlot(2));
+ biasLayer1->GetOutputSlot(0).SetTensorInfo(biasesInfo);
+
+ convLayer1->GetOutputSlot(0).Connect(convLayer2->GetInputSlot(0));
+ convLayer1->GetOutputSlot(0).SetTensorInfo(intermediateInfo);
+
+ weightLayer2->GetOutputSlot(0).Connect(convLayer2->GetInputSlot(1));
+ weightLayer2->GetOutputSlot(0).SetTensorInfo(weightsInfo);
+
+ biasLayer2->GetOutputSlot(0).Connect(convLayer2->GetInputSlot(2));
+ biasLayer2->GetOutputSlot(0).SetTensorInfo(biasesInfo);
+
+ convLayer2->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
+ convLayer2->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+ std::vector<BackendId> backends = { "GpuFsa" };
+
+ OptimizerOptionsOpaque optimizedOptions;
+ IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optimizedOptions);
+ CHECK(optNet);
+
+ Graph& graph = GetGraphForTesting(optNet.get());
+
+ // Check graph layer sequence to ensure that the network has been replaced with a PreCompiledLayer
+ CHECK(CheckSequence(graph.cbegin(), graph.cend(),
+ &IsLayerOfType<InputLayer>,
+ &IsLayerOfType<ConstantLayer>,
+ &IsLayerOfType<ConstantLayer>,
+ &IsLayerOfType<ConstantLayer>,
+ &IsLayerOfType<ConstantLayer>,
+ &IsLayerOfType<PreCompiledLayer>,
+ &IsLayerOfType<PreCompiledLayer>,
+ &IsLayerOfType<OutputLayer>));
+}
+
+} \ No newline at end of file
diff --git a/src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp b/src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp
new file mode 100644
index 0000000000..c1d75d625b
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaWorkloadFactoryHelper.hpp
@@ -0,0 +1,45 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/test/WorkloadFactoryHelper.hpp>
+
+#include <armnn/utility/PolymorphicDowncast.hpp>
+
+#include <gpuFsa/GpuFsaBackend.hpp>
+#include <gpuFsa/GpuFsaWorkloadFactory.hpp>
+#include "gpuFsa/GpuFsaTensorHandleFactory.hpp"
+
+namespace
+{
+
+template<>
+struct WorkloadFactoryHelper<armnn::GpuFsaWorkloadFactory>
+{
+ static armnn::IBackendInternal::IMemoryManagerSharedPtr GetMemoryManager()
+ {
+ armnn::GpuFsaBackend backend;
+ return backend.CreateMemoryManager();
+ }
+
+ static armnn::GpuFsaWorkloadFactory GetFactory(
+ const armnn::IBackendInternal::IMemoryManagerSharedPtr&)
+ {
+ return armnn::GpuFsaWorkloadFactory();
+ }
+
+ static armnn::GpuFsaTensorHandleFactory GetTensorHandleFactory(
+ const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr)
+ {
+ return armnn::GpuFsaTensorHandleFactory(
+ armnn::PolymorphicPointerDowncast<armnn::GpuFsaMemoryManager>(memoryManager));
+ }
+};
+
+using GpuFsaWorkloadFactoryHelper = WorkloadFactoryHelper<armnn::GpuFsaWorkloadFactory>;
+
+} // anonymous namespace
diff --git a/src/backends/gpuFsa/workloads/CMakeLists.txt b/src/backends/gpuFsa/workloads/CMakeLists.txt
new file mode 100644
index 0000000000..4d100123ea
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/CMakeLists.txt
@@ -0,0 +1,16 @@
+#
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnGpuFsaBackendWorkloads_sources
+ GpuFsaBaseWorkload.hpp
+)
+
+add_library(armnnGpuFsaBackendWorkloads OBJECT ${armnnGpuFsaBackendWorkloads_sources})
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/backends)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/profiling)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/profiling/common/include)
+target_include_directories(armnnGpuFsaBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/profiling/client/include)
diff --git a/src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp b/src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp
new file mode 100644
index 0000000000..c274e14665
--- /dev/null
+++ b/src/backends/gpuFsa/workloads/GpuFsaBaseWorkload.hpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/backends/Workload.hpp>
+
+namespace armnn
+{
+
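+// Common base for GpuFsa workloads: it opts in to tensor handle replacement so that
+// input and output memory can be rebound after the workload is constructed.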
+template <typename QueueDescriptor>
+class GpuFsaBaseWorkload : public BaseWorkload<QueueDescriptor>
+{
+public:
+ GpuFsaBaseWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info)
+ : BaseWorkload<QueueDescriptor>(descriptor, info)
+ {}
+
+ virtual bool SupportsTensorHandleReplacement() const override
+ {
+ return true;
+ }
+
+ // Replace input tensor handle with the given TensorHandle
+ void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+ {
+ this->m_Data.m_Inputs[slot] = tensorHandle;
+ }
+
+ // Replace output tensor handle with the given TensorHandle
+ void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
+ {
+ this->m_Data.m_Outputs[slot] = tensorHandle;
+ }
+};
+
+} // namespace armnn