From 2b32a69f3aac5496d0a966d9740cb4854504f3d9 Mon Sep 17 00:00:00 2001 From: Cathal Corbett Date: Mon, 9 Jan 2023 12:47:48 +0000 Subject: IVGCVSW-7380 Update the GpuFsa Skeleton to build and load ACL * Reuse the cl backend to be able to create ClRuntime, ClContexts etc. for the new GpuFsa backend. * Code defined in the experimental dynamic_fusion interface can now be accessed. * No BackendModelContext, as model/backend options are not required for now. * Serializer and deserializer code is omitted, as context caching is not required. * No ImportTensorHandle or ImportTensorHandleFactory for now. * Moved tuning and IClTensorHandle code to aclCommon, as it is accessed by both cl and gpuFsa. * Small code refactor of the cl backend. * Added DefaultAllocatorTests to the GpuFsa backend. Signed-off-by: Cathal Corbett Change-Id: I6ae591360e9d2a783aafd06e2d7bf8e0b3e623ee --- CMakeLists.txt | 2 +- cmake/GlobalConfig.cmake | 19 +- src/armnn/Network.cpp | 14 +- src/backends/aclCommon/BaseMemoryManager.cpp | 14 +- src/backends/aclCommon/BaseMemoryManager.hpp | 34 +- src/backends/aclCommon/common.cmake | 4 +- src/backends/gpuFsa/CMakeLists.txt | 20 +- src/backends/gpuFsa/GpuFsaBackend.cpp | 172 +++++++--- src/backends/gpuFsa/GpuFsaBackend.hpp | 271 ++++++++++++++-- src/backends/gpuFsa/GpuFsaBackendContext.cpp | 230 ++++++++++++++ src/backends/gpuFsa/GpuFsaBackendContext.hpp | 47 +++ .../gpuFsa/GpuFsaBackendDefaultAllocator.hpp | 51 +++ src/backends/gpuFsa/GpuFsaContextControl.cpp | 163 ++++++++++ src/backends/gpuFsa/GpuFsaContextControl.hpp | 42 +++ src/backends/gpuFsa/GpuFsaMemoryManager.cpp | 101 ------ src/backends/gpuFsa/GpuFsaMemoryManager.hpp | 59 ---- src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp | 4 +- src/backends/gpuFsa/GpuFsaTensorHandle.cpp | 176 ----------- src/backends/gpuFsa/GpuFsaTensorHandle.hpp | 350 ++++++++++++++++++--- src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp | 67 ++-- src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp | 7 +- src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp | 58 +--- src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp | 18 +- src/backends/gpuFsa/backend.cmake | 4 +- src/backends/gpuFsa/backend.mk | 22 +- src/backends/gpuFsa/test/CMakeLists.txt | 3 +- .../gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp | 193 ++++++++++++ 27 files changed, 1582 insertions(+), 563 deletions(-) create mode 100644 src/backends/gpuFsa/GpuFsaBackendContext.cpp create mode 100644 src/backends/gpuFsa/GpuFsaBackendContext.hpp create mode 100644 src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp create mode 100644 src/backends/gpuFsa/GpuFsaContextControl.cpp create mode 100644 src/backends/gpuFsa/GpuFsaContextControl.hpp delete mode 100644 src/backends/gpuFsa/GpuFsaMemoryManager.cpp delete mode 100644 src/backends/gpuFsa/GpuFsaMemoryManager.hpp delete mode 100644 src/backends/gpuFsa/GpuFsaTensorHandle.cpp create mode 100644 src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 476e080442..19626f2862 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,7 +502,7 @@ endif() install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) target_link_libraries(armnn PUBLIC ${ARMCOMPUTE_LIBRARIES}) endif() diff --git a/cmake/GlobalConfig.cmake b/cmake/GlobalConfig.cmake index bc9117f702..8a1211246c 100644 --- a/cmake/GlobalConfig.cmake +++ b/cmake/GlobalConfig.cmake @@ -10,7 +10,7 @@ option(BUILD_TESTS "Build test applications" OFF) option(BUILD_FOR_COVERAGE "Use no
optimization and output .gcno and .gcda files" OFF) option(ARMCOMPUTENEON "Build with ARM Compute NEON support" OFF) option(ARMCOMPUTECL "Build with ARM Compute OpenCL support" OFF) -option(ARMNNGPUFSA "Build with GPU Dynamic Fusion Backend" OFF) +option(ARMCOMPUTEGPUFSA "Build with GPU Dynamic Fusion Backend" OFF) option(ARMNNREF "Build with ArmNN reference support" ON) option(ARMNNTOSAREF "Build with TOSA reference support" OFF) option(PROFILING_BACKEND_STREAMLINE "Forward the armNN profiling events to DS-5/Streamline as annotations" OFF) @@ -261,7 +261,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/profiling) # ARM Compute # Note that ARM Compute has a different folder layout depending on the branch but also on # whether it comes from a prepackaged archive (this is why we add several hints below) -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) find_path(ARMCOMPUTE_INCLUDE arm_compute/core/CL/OpenCL.h PATHS ${ARMCOMPUTE_ROOT}/include PATHS ${ARMCOMPUTE_ROOT}/applications/arm_compute @@ -330,7 +330,7 @@ if(ARMCOMPUTENEON) endif() # ARM Compute OpenCL backend -if(ARMCOMPUTECL) +if(ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) # verify we have a valid flatbuffers include path find_path(FLATBUFFERS_INCLUDE_PATH flatbuffers/flatbuffers.h HINTS ${FLATBUFFERS_ROOT}/include /usr/local/include /usr/include) @@ -354,15 +354,22 @@ if(ARMCOMPUTECL) include_directories(SYSTEM ${OPENCL_INCLUDE}) - # Add preprocessor definition for ARM Compute OpenCL - add_definitions(-DARMCOMPUTECL_ENABLED) + if(ARMCOMPUTECL) + # Add preprocessor definition for ARM Compute OpenCL + add_definitions(-DARMCOMPUTECL_ENABLED) + endif() + + if(ARMCOMPUTEGPUFSA) + # Add preprocessor definition for ARM Compute OpenCL + add_definitions(-DARMCOMPUTEGPUFSA_ENABLED) + endif() set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DARM_COMPUTE_DEBUG_ENABLED") endif() # Used by both Arm Compute backends, but should be added # to the search path after the system directories if necessary -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) find_path(HALF_INCLUDE half/half.hpp) find_path(HALF_INCLUDE half/half.hpp PATHS ${ARMCOMPUTE_ROOT}/include diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index 42388bfbd7..cda87e89c2 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017,2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017,2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -1582,6 +1582,18 @@ IOptimizedNetworkPtr Optimize(const Graph& inGraph, ProfilerManager::GetInstance().RegisterProfiler(profiler.get()); profiler->EnableProfiling(options.m_ProfilingEnabled); + // Some backends don't play well together. Check here before continuing. + { + std::set backendSet(backendPreferences.begin(), backendPreferences.end()); + // GpuFsa cannot co-exist with GpuAcc. 
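+        // For illustration, a hypothetical caller that would trigger this check
+        // (the network/runtime names below are illustrative, not part of this change):
+        //     std::vector<armnn::BackendId> prefs = { "GpuFsa", "GpuAcc" };
+        //     armnn::Optimize(*network, prefs, runtime->GetDeviceSpec()); // throws below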
+ if (backendSet.find("GpuFsa") != backendSet.end() && + backendSet.find("GpuAcc") != backendSet.end()) + { + throw InvalidArgumentException("The backends \"GpuAcc\" and \"GpuFsa\" cannot be specified " + "for the same optimized network."); + } + } + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Optimizer"); if (backendPreferences.empty()) { diff --git a/src/backends/aclCommon/BaseMemoryManager.cpp b/src/backends/aclCommon/BaseMemoryManager.cpp index c60a4a04ae..e70d7f851d 100644 --- a/src/backends/aclCommon/BaseMemoryManager.cpp +++ b/src/backends/aclCommon/BaseMemoryManager.cpp @@ -1,10 +1,10 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #include "BaseMemoryManager.hpp" -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) #include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/PoolManager.h" #include "arm_compute/runtime/OffsetLifetimeManager.h" @@ -14,7 +14,7 @@ namespace armnn { -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) BaseMemoryManager::BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity) { @@ -104,4 +104,12 @@ ClMemoryManager::CreateMemoryGroup(const std::shared_ptr +GpuFsaMemoryManager::CreateMemoryGroup(const std::shared_ptr& memoryManager) +{ + return std::make_shared(memoryManager); } +#endif + +} \ No newline at end of file diff --git a/src/backends/aclCommon/BaseMemoryManager.hpp b/src/backends/aclCommon/BaseMemoryManager.hpp index af099f900a..c18c4830a0 100644 --- a/src/backends/aclCommon/BaseMemoryManager.hpp +++ b/src/backends/aclCommon/BaseMemoryManager.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #pragma once @@ -7,17 +7,13 @@ #include #include -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) -#include -#endif - -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) #include #include #include #endif -#if defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) #include #endif @@ -39,7 +35,7 @@ public: void Acquire() override; void Release() override; -#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) || defined(ARMCOMPUTEGPUFSA_ENABLED) BaseMemoryManager(std::shared_ptr alloc, MemoryAffinity memoryAffinity); std::shared_ptr& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; } @@ -98,4 +94,24 @@ protected: }; #endif -} //namespace armnn +#if defined(ARMCOMPUTEGPUFSA_ENABLED) +class GpuFsaMemoryManager : public BaseMemoryManager +{ +public: + GpuFsaMemoryManager() {} + virtual ~GpuFsaMemoryManager() {} + + GpuFsaMemoryManager(std::shared_ptr alloc) + : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer) + { + arm_compute::CLTensorAllocator::set_global_allocator(alloc.get()); + m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); + } + +protected: + std::shared_ptr + CreateMemoryGroup(const std::shared_ptr& memoryManager) override; +}; +#endif + +} // namespace armnn diff --git a/src/backends/aclCommon/common.cmake b/src/backends/aclCommon/common.cmake index 89be236a7f..1ea14951a6 100644 --- a/src/backends/aclCommon/common.cmake +++ b/src/backends/aclCommon/common.cmake @@ -1,9 +1,9 @@ # -# Copyright © 2017 Arm Ltd. All rights reserved. +# Copyright © 2017, 2023 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT # -if(ARMCOMPUTENEON OR ARMCOMPUTECL) +if(ARMCOMPUTENEON OR ARMCOMPUTECL OR ARMCOMPUTEGPUFSA) add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/aclCommon) list(APPEND armnnLibraries armnnAclCommon) list(APPEND armnnUnitTestLibraries armnnAclCommonUnitTests) diff --git a/src/backends/gpuFsa/CMakeLists.txt b/src/backends/gpuFsa/CMakeLists.txt index f5ddb34854..635b25b2d5 100644 --- a/src/backends/gpuFsa/CMakeLists.txt +++ b/src/backends/gpuFsa/CMakeLists.txt @@ -1,24 +1,26 @@ # -# Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. 
# SPDX-License-Identifier: MIT # -if(ARMNNGPUFSA) +if(ARMCOMPUTEGPUFSA) list(APPEND armnnGpuFsaBackend_sources GpuFsaBackend.cpp GpuFsaBackend.hpp + GpuFsaBackendContext.cpp + GpuFsaBackendContext.hpp + GpuFsaBackendDefaultAllocator.hpp GpuFsaBackendId.hpp - GpuFsaTensorHandle.hpp - GpuFsaTensorHandle.cpp + GpuFsaContextControl.cpp + GpuFsaContextControl.hpp GpuFsaLayerSupport.cpp GpuFsaLayerSupport.hpp - GpuFsaMemoryManager.hpp - GpuFsaMemoryManager.cpp GpuFsaRegistryInitializer.cpp - GpuFsaWorkloadFactory.cpp - GpuFsaWorkloadFactory.hpp + GpuFsaTensorHandle.hpp GpuFsaTensorHandleFactory.cpp GpuFsaTensorHandleFactory.hpp + GpuFsaWorkloadFactory.cpp + GpuFsaWorkloadFactory.hpp ) add_subdirectory(workloads) @@ -30,6 +32,8 @@ if(ARMNNGPUFSA) else() list(APPEND armnnGpuFsaBackend_sources GpuFsaBackendId.hpp + GpuFsaLayerSupport.cpp + GpuFsaLayerSupport.hpp ) endif() diff --git a/src/backends/gpuFsa/GpuFsaBackend.cpp b/src/backends/gpuFsa/GpuFsaBackend.cpp index 9c2f4a0df6..ae7ff0c243 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.cpp +++ b/src/backends/gpuFsa/GpuFsaBackend.cpp @@ -1,23 +1,24 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #include "GpuFsaBackend.hpp" +#include "GpuFsaBackendContext.hpp" +#include "GpuFsaBackendDefaultAllocator.hpp" #include "GpuFsaBackendId.hpp" -#include "GpuFsaWorkloadFactory.hpp" #include "GpuFsaLayerSupport.hpp" #include "GpuFsaTensorHandleFactory.hpp" +#include "GpuFsaWorkloadFactory.hpp" -#include #include #include -#include -#include -#include - #include +#include + +#include + namespace armnn { @@ -27,6 +28,15 @@ const BackendId& GpuFsaBackend::GetIdStatic() return s_Id; } +IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const +{ + if (m_UsingCustomAllocator) + { + return std::make_unique(m_CustomAllocator); + } + return std::make_unique(std::make_unique()); +} + IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const { @@ -34,74 +44,142 @@ IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( } IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( - class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const + TensorHandleFactoryRegistry& registry) const { - auto memoryManager = std::make_shared(); - - tensorHandleFactoryRegistry.RegisterMemoryManager(memoryManager); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); - auto factory = std::make_unique(memoryManager); - // Register copy and import factory pair - tensorHandleFactoryRegistry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId()); - // Register the factory - tensorHandleFactoryRegistry.RegisterFactory(std::move(factory)); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); return std::make_unique(PolymorphicPointerDowncast(memoryManager)); } -IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions&) const +IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory( + TensorHandleFactoryRegistry& registry, + const ModelOptions& modelOptions, + 
MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const { - return IBackendContextPtr{}; + IgnoreUnused(modelOptions); + + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast(MemorySource::Undefined)) + { + inputFlags = static_cast(MemorySource::Malloc); + } + if (outputFlags == static_cast(MemorySource::Undefined)) + { + outputFlags = static_cast(MemorySource::Malloc); + } + + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + + return std::make_unique(PolymorphicPointerDowncast(memoryManager)); } -IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext( - const IRuntime::CreationOptions&, IBackendProfilingPtr&) +std::vector GpuFsaBackend::GetHandleFactoryPreferences() const { - return IBackendProfilingContextPtr{}; + return std::vector { GpuFsaTensorHandleFactory::GetIdStatic() }; } -IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const +void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) { - return std::make_unique(); + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); + } -IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const +void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) { - static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport}; - return layerSupport; + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast(MemorySource::Undefined)) + { + inputFlags = static_cast(MemorySource::Malloc); + } + if (outputFlags == static_cast(MemorySource::Undefined)) + { + outputFlags = static_cast(MemorySource::Malloc); + } + + std::shared_ptr memoryManager; + if (m_UsingCustomAllocator) + { + memoryManager = std::make_shared(m_CustomAllocator); + } + else + { + memoryManager = std::make_shared(std::make_unique()); + } + + std::unique_ptr factory = std::make_unique(memoryManager); + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::move(factory)); } -OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph, - const ModelOptions& modelOptions) const +IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const { - OptimizationViews optimizationViews(modelOptions); - optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); - - return optimizationViews; + return IBackendContextPtr{new GpuFsaBackendContext{options}}; } -std::vector GpuFsaBackend::GetHandleFactoryPreferences() const +IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext( + const IRuntime::CreationOptions&, IBackendProfilingPtr&) { - return std::vector { GpuFsaTensorHandleFactory::GetIdStatic() }; + return 
IBackendProfilingContextPtr{}; } -void GpuFsaBackend::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry) +IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const { - auto memoryManager = std::make_shared(); - - registry.RegisterMemoryManager(memoryManager); - - auto factory = std::make_unique(memoryManager); - - // Register copy and import factory pair - registry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId()); - // Register the factory - registry.RegisterFactory(std::move(factory)); + static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport}; + return layerSupport; } std::unique_ptr GpuFsaBackend::GetDefaultAllocator() const { - return std::make_unique(); + return std::make_unique(); +} + +OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const +{ + OptimizationViews optimizationViews(modelOptions); + optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + return optimizationViews; } -} // namespace armnn \ No newline at end of file +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp index 803c6a4c66..6d886a12b1 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.hpp +++ b/src/backends/gpuFsa/GpuFsaBackend.hpp @@ -1,56 +1,287 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once #include +#include + +#include +#include +#include +#include + +// System includes for mapping and unmapping memory +#include namespace armnn { +// add new capabilities here.. +const BackendCapabilities gpuFsaCapabilities("GpuFsa", + { + {"NonConstWeights", false}, + {"AsyncExecution", false}, + {"ProtectedContentAllocation", true}, + {"ConstantTensorsAsInputs", true}, + {"PreImportIOTensors", false}, + {"ExternallyManagedMemory", true}, + {"MultiAxisPacking", false}, + {"SingleAxisPacking", true} + }); + class GpuFsaBackend : public IBackendInternal { public: - GpuFsaBackend() = default; + GpuFsaBackend() : m_CustomAllocator(nullptr) {}; + GpuFsaBackend(std::shared_ptr allocator) + { + std::string err; + UseCustomMemoryAllocator(allocator, err); + } ~GpuFsaBackend() = default; static const BackendId& GetIdStatic(); - const BackendId& GetId() const override - { - return GetIdStatic(); - } + const BackendId& GetId() const override { return GetIdStatic(); } IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override; IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory( - const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override; + const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override; - IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory( - class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const override; + IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override; - IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override; + IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const override; + + std::vector GetHandleFactoryPreferences() const override; + + void 
RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override; - IBackendInternal::IBackendProfilingContextPtr - CreateBackendProfilingContext(const IRuntime::CreationOptions& creationOptions, - IBackendProfilingPtr& backendProfiling) override; + void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) override; + + IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override; + IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext( + const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override; IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override; OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph, const ModelOptions& modelOptions) const override; - std::vector GetHandleFactoryPreferences() const override; + std::unique_ptr GetDefaultAllocator() const override; - void RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry) override; + BackendCapabilities GetCapabilities() const override + { + return gpuFsaCapabilities; + }; - std::unique_ptr GetDefaultAllocator() const override; + virtual bool UseCustomMemoryAllocator(std::shared_ptr allocator, + armnn::Optional errMsg) override + { + IgnoreUnused(errMsg); + ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend"; + + // Set flag to signal the backend to use a custom memory allocator + m_CustomAllocator = std::make_shared(std::move(allocator)); + m_UsingCustomAllocator = true; + return m_UsingCustomAllocator; + } + + // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this + class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator + { + public: + GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr alloc) : m_CustomAllocator(alloc) + {} + // Inherited methods overridden: + void* allocate(size_t size, size_t alignment) override + { + auto alloc = m_CustomAllocator->allocate(size, alignment); + return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType()); + } + void free(void* ptr) override + { + auto hostMemPtr = m_AllocatedBufferMappings[ptr]; + clReleaseMemObject(static_cast(ptr)); + m_CustomAllocator->free(hostMemPtr); + } + std::unique_ptr make_region(size_t size, size_t alignment) override + { + auto hostMemPtr = m_CustomAllocator->allocate(size, alignment); + cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType()); + + return std::make_unique(cl::Buffer(buffer), + hostMemPtr, + m_CustomAllocator->GetMemorySourceType()); + } + private: + cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source) + { + // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + auto cachelineAlignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment); + + if (source == MemorySource::Malloc) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_HOST_ARM, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + 
"Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error)); + } + else if (source == MemorySource::DmaBuf) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_DMA_BUF_ARM, + CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM, + CL_TRUE, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + + std::to_string(error)); + } + else if (source == MemorySource::DmaBufProtected) + { + const cl_import_properties_arm importProperties[] = + { + CL_IMPORT_TYPE_ARM, + CL_IMPORT_TYPE_DMA_BUF_ARM, + CL_IMPORT_TYPE_PROTECTED_ARM, + CL_TRUE, + 0 + }; + cl_int error = CL_SUCCESS; + cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(), + CL_MEM_READ_WRITE, + importProperties, + memory, + roundedSize, + &error); + if (error == CL_SUCCESS) + { + m_AllocatedBufferMappings.insert(std::make_pair(static_cast(buffer), memory)); + return buffer; + } + throw armnn::Exception( + "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + + std::to_string(error)); + } + throw armnn::Exception( + "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator"); + } + std::shared_ptr m_CustomAllocator; + std::map m_AllocatedBufferMappings; + }; + + class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion + { + public: + // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access + ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source) + : ICLMemoryRegion(buffer.getInfo()) + { + _mem = buffer; + m_HostMemPtr = hostMemPtr; + m_MemorySource = source; + } + + // Inherited methods overridden : + void* ptr() override + { + return nullptr; + } + + void* map(cl::CommandQueue &q, bool blocking) override + { + armnn::IgnoreUnused(q, blocking); + if (m_HostMemPtr == nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr"); + } + if (_mapping != nullptr) + { + throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped"); + } + switch (m_MemorySource) + { + case armnn::MemorySource::Malloc: + _mapping = m_HostMemPtr; + return _mapping; + break; + case armnn::MemorySource::DmaBuf: + case armnn::MemorySource::DmaBufProtected: + // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd + _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast(m_HostMemPtr)), 0); + return _mapping; + break; + default: + throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source"); + break; + } + } -private: - // Private members + void unmap(cl::CommandQueue &q) override + { + armnn::IgnoreUnused(q); + switch (m_MemorySource) + { + case armnn::MemorySource::Malloc: + _mapping = nullptr; + break; + case armnn::MemorySource::DmaBuf: + case armnn::MemorySource::DmaBufProtected: + munmap(_mapping, _size); + _mapping = nullptr; + break; + default: + throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source"); + 
break; + } + } + private: + void* m_HostMemPtr = nullptr; + armnn::MemorySource m_MemorySource; + }; -protected: - // Protected members + std::shared_ptr m_CustomAllocator; + bool m_UsingCustomAllocator = false; }; } // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.cpp b/src/backends/gpuFsa/GpuFsaBackendContext.cpp new file mode 100644 index 0000000000..72b77e0d19 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendContext.cpp @@ -0,0 +1,230 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaBackendContext.hpp" +#include "GpuFsaContextControl.hpp" + +#include +#include + +#include + +namespace armnn +{ + +struct GpuFsaBackendContext::GpuFsaContextControlWrapper +{ + GpuFsaContextControlWrapper(arm_compute::CLTuner* tuner, + arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle, + bool profilingEnabled) + : m_GpuFsaContextControl(tuner, heuristicsHandle, profilingEnabled) + {} + + bool Sync() + { + if (arm_compute::CLScheduler::get().context()() != NULL) + { + // Waits for all queued CL requests to finish before unloading the network they may be using. + try + { + // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error. + arm_compute::CLScheduler::get().sync(); + } + catch (const cl::Error&) + { + ARMNN_LOG(warning) << "Runtime::UnloadNetwork(): an error occurred while waiting for " + "the queued CL requests to finish"; + return false; + } + } + + return true; + } + + void ClearClCache() + { + if (arm_compute::CLScheduler::get().context()() != NULL) + { + // There are no loaded networks left, so clear the CL cache to free up memory + m_GpuFsaContextControl.ClearClCache(); + } + } + + GpuFsaContextControl m_GpuFsaContextControl; +}; + +GpuFsaBackendContext::GpuFsaBackendContext(const IRuntime::CreationOptions& options) + : IBackendContext(options) + , m_TuningFile() +{ + bool kernelProfiling = options.m_EnableGpuProfiling; + + arm_compute::CLTuner* tuner = nullptr; + arm_compute::CLGEMMHeuristicsHandle* mlgoTuner = nullptr; + bool useLegacyTunerAPI = options.m_GpuAccTunedParameters.get() != nullptr; + if (useLegacyTunerAPI) + { + auto clTunerParams = PolymorphicDowncast( + options.m_GpuAccTunedParameters.get()); + tuner = &clTunerParams->m_Tuner; + + if (tuner) + { + auto ConvertTuningLevel = [](IGpuAccTunedParameters::TuningLevel level, + armnn::IGpuAccTunedParameters::Mode mode) + { + if (mode == armnn::IGpuAccTunedParameters::Mode::UseTunedParameters) + { + return TuningLevel::None; + } + + switch(level) + { + case IGpuAccTunedParameters::TuningLevel::Rapid: + return TuningLevel::Rapid; + case IGpuAccTunedParameters::TuningLevel::Normal: + return TuningLevel::Normal; + case IGpuAccTunedParameters::TuningLevel::Exhaustive: + return TuningLevel::Exhaustive; + default: + { + ARMNN_ASSERT_MSG(false, "Tuning level not recognised."); + return TuningLevel::None; + } + } + }; + + TuningLevel tuningLevel = ConvertTuningLevel(clTunerParams->m_TuningLevel, clTunerParams->m_Mode); + ConfigureTuner(*tuner, tuningLevel); + } + } + else //New backend options API + { + const TuningLevel defaultTuningLevel = TuningLevel::None; + auto tuningLevel = defaultTuningLevel; + + ParseOptions(options.m_BackendOptions, "GpuFsa", [&](std::string name, const BackendOptions::Var& value) + { + if (name == "KernelProfilingEnabled") + { + kernelProfiling |= ParseBooleanBackendOption(value, false); + } else if (name == "TuningFile") + { + m_TuningFile = ParseStringBackendOption(value, 
""); + } else if (name == "TuningLevel") + { + tuningLevel = ParseTuningLevel(value, defaultTuningLevel); + } + else if (name == "MLGOTuningFilePath") + { + m_MLGOTuningFile = ParseStringBackendOption(value, ""); + } + }); + + // Create the tuner, in tuning mode initially. + m_Tuner = std::make_unique(true); + + ConfigureTuner(*(m_Tuner.get()), tuningLevel); + + if (!m_TuningFile.empty()) + { + try + { + ARMNN_LOG(info) << "Loading Gpu tuning data from file: " << m_TuningFile; + m_Tuner->load_from_file(m_TuningFile.c_str()); + } + catch (const std::exception& e) + { + // Warn if not tuning, otherwise tuning will generate new params + if (tuningLevel == TuningLevel::None) + { + ARMNN_LOG(warning) << "Could not load GpuFsa tuner data file."; + } + } + } + + if (!m_MLGOTuningFile.empty()) + { + try + { + ARMNN_LOG(info) << "Loading Gpu MLGO tuning data from file: " << m_TuningFile; + if(m_MLGOTuner.reload_from_file(m_MLGOTuningFile.c_str())) + { + mlgoTuner = &m_MLGOTuner; + } + } + catch (const std::exception& e) + { + ARMNN_LOG(warning) << "Could not load GpuFsa MLGO tuner data file."; + } + } + + tuner = m_Tuner.get(); + } + + m_GpuFsaContextControlWrapper = std::make_unique( + tuner, + mlgoTuner, + kernelProfiling + ); +} + +bool GpuFsaBackendContext::BeforeLoadNetwork(NetworkId) +{ + return true; +} + +bool GpuFsaBackendContext::AfterLoadNetwork(NetworkId networkId) +{ + { + std::lock_guard lockGuard(m_Mutex); + m_NetworkIds.insert(networkId); + } + return true; +} + +bool GpuFsaBackendContext::BeforeUnloadNetwork(NetworkId) +{ + return m_GpuFsaContextControlWrapper->Sync(); +} + +bool GpuFsaBackendContext::AfterUnloadNetwork(NetworkId networkId) +{ + bool clearCache = false; + { + std::lock_guard lockGuard(m_Mutex); + m_NetworkIds.erase(networkId); + clearCache = m_NetworkIds.empty(); + } + + if (clearCache) + { + m_GpuFsaContextControlWrapper->ClearClCache(); + } + + return true; +} + +bool GpuFsaBackendContext::AfterEnqueueWorkload(NetworkId) +{ + return m_GpuFsaContextControlWrapper->Sync(); +} + +GpuFsaBackendContext::~GpuFsaBackendContext() +{ + if (m_Tuner && !m_TuningFile.empty()) + { + try + { + m_Tuner->save_to_file(m_TuningFile.c_str()); + } + catch(const std::exception& e) + { + ARMNN_LOG(warning) << "Could not save GpuFsa tuner data to file " << m_TuningFile; + } + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendContext.hpp b/src/backends/gpuFsa/GpuFsaBackendContext.hpp new file mode 100644 index 0000000000..271688fd99 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendContext.hpp @@ -0,0 +1,47 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include +#include + +#include +#include + +namespace armnn +{ + +class GpuFsaBackendContext : public IBackendContext +{ +public: + GpuFsaBackendContext(const IRuntime::CreationOptions& options); + + bool BeforeLoadNetwork(NetworkId networkId) override; + bool AfterLoadNetwork(NetworkId networkId) override; + + bool BeforeUnloadNetwork(NetworkId networkId) override; + bool AfterUnloadNetwork(NetworkId networkId) override; + + bool AfterEnqueueWorkload(NetworkId networkId) override; + + ~GpuFsaBackendContext() override; + +private: + std::mutex m_Mutex; + struct GpuFsaContextControlWrapper; + std::unique_ptr m_GpuFsaContextControlWrapper; + + std::unordered_set m_NetworkIds; + + std::unique_ptr m_Tuner; + std::string m_TuningFile; + +protected: + arm_compute::CLGEMMHeuristicsHandle m_MLGOTuner; + std::string m_MLGOTuningFile; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp new file mode 100644 index 0000000000..c57ff63b92 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaBackendDefaultAllocator.hpp @@ -0,0 +1,51 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +#include +#include + +namespace armnn +{ + +/** +* Default Memory Allocator class returned from IBackendInternal::GetDefaultAllocator(MemorySource) +*/ +class GpuFsaBackendDefaultAllocator : public ICustomAllocator +{ +public: + GpuFsaBackendDefaultAllocator() = default; + + void* allocate(size_t size, size_t alignment = 0) override + { + IgnoreUnused(alignment); + cl_mem buf{ clCreateBuffer(arm_compute::CLScheduler::get().context().get(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + size, + nullptr, + nullptr)}; + return static_cast(buf); + } + + void free(void* ptr) override + { + ARM_COMPUTE_ERROR_ON(ptr == nullptr); + clReleaseMemObject(static_cast(ptr)); + } + + MemorySource GetMemorySourceType() override + { + return MemorySource::Gralloc; + } + + void* GetMemoryRegionAtOffset(void* buffer, size_t offset, size_t alignment = 0) override + { + IgnoreUnused(alignment); + return static_cast(buffer) + offset; + } +}; +} // namespace armnn \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaContextControl.cpp b/src/backends/gpuFsa/GpuFsaContextControl.cpp new file mode 100644 index 0000000000..795de5e14d --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaContextControl.cpp @@ -0,0 +1,163 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "GpuFsaContextControl.hpp" + +#include +#include +#include + +#include +#include + +#include + +namespace cl +{ +class Context; +class CommandQueue; +class Device; +} + +namespace armnn +{ + +GpuFsaContextControl::GpuFsaContextControl(arm_compute::CLTuner *tuner, + arm_compute::CLGEMMHeuristicsHandle* heuristicsHandle, + bool profilingEnabled) + : m_Tuner(tuner) + , m_HeuristicsHandle(heuristicsHandle) + , m_ProfilingEnabled(profilingEnabled) +{ + try + { + std::vector platforms; + cl::Platform::get(&platforms); + + // Selects default platform for the first element. + cl::Platform::setDefault(platforms[0]); + + std::vector devices; + platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); + + // Selects default device for the first element. 
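+        // (This assumes at least one CL_DEVICE_TYPE_GPU device is exposed by platform 0;
+        // on a multi-GPU system the first enumerated device may not be the intended one.)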
+ cl::Device::setDefault(devices[0]); + } + catch (const cl::Error& clError) + { + throw ClRuntimeUnavailableException(fmt::format( + "Could not initialize the CL runtime. Error description: {0}. CL error code: {1}", + clError.what(), clError.err())); + } + + // Removes the use of global CL context. + cl::Context::setDefault(cl::Context{}); + ARMNN_ASSERT(cl::Context::getDefault()() == NULL); + + // Removes the use of global CL command queue. + cl::CommandQueue::setDefault(cl::CommandQueue{}); + ARMNN_ASSERT(cl::CommandQueue::getDefault()() == NULL); + + // Always load the OpenCL runtime. + LoadOpenClRuntime(); +} + +GpuFsaContextControl::~GpuFsaContextControl() +{ + // Reload the OpenCL runtime without the tuned parameters to free the memory used by them. + try + { + UnloadOpenClRuntime(); + } + catch (const cl::Error& clError) + { + // This should not happen; it is ignored if it does. + + // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an + // exception of type std::length_error. + // Using stderr instead in this context as there is no point in nesting try-catch blocks here. + std::cerr << "A CL error occurred unloading the runtime tuner parameters: " + << clError.what() << ". CL error code is: " << clError.err() << std::endl; + } +} + +void GpuFsaContextControl::LoadOpenClRuntime() +{ + DoLoadOpenClRuntime(true); +} + +void GpuFsaContextControl::UnloadOpenClRuntime() +{ + DoLoadOpenClRuntime(false); +} + +void GpuFsaContextControl::DoLoadOpenClRuntime(bool updateTunedParameters) +{ + cl::Device device = cl::Device::getDefault(); + cl::Context context; + cl::CommandQueue commandQueue; + + if (arm_compute::CLScheduler::get().is_initialised() && arm_compute::CLScheduler::get().context()() != NULL) + { + // Wait for all queued CL requests to finish before reinitialising it. + arm_compute::CLScheduler::get().sync(); + } + + try + { + arm_compute::CLKernelLibrary::get().clear_programs_cache(); + // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no + // context references); it is initialised again, with a proper context, later. + arm_compute::CLScheduler::get().init(context, commandQueue, device); + arm_compute::CLKernelLibrary::get().init(".", context, device); + + { + // + // Here we replace the context with a new one in which + // the memory leak checks show it as an extra allocation but + // because of the scope of the leak checks, it doesn't count + // the disposal of the original object. On the other hand it + // does count the creation of this context which it flags + // as a memory leak. By adding the following line we prevent + // this from happening. + // + ARMNN_DISABLE_LEAK_CHECKING_IN_SCOPE(); + context = cl::Context(device); + } + + // NOTE: In this specific case profiling has to be enabled on the command queue + // in order for the CLTuner to work. + bool profilingNeededForClTuner = updateTunedParameters && m_Tuner && + m_Tuner->tune_new_kernels(); + + if (m_ProfilingEnabled || profilingNeededForClTuner) + { + // Create a new queue with profiling enabled. + commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); + } + else + { + // Use default queue. + commandQueue = cl::CommandQueue(context, device); + } + } + catch (const cl::Error& clError) + { + throw ClRuntimeUnavailableException(fmt::format( + "Could not initialize the CL runtime. Error description: {0}.
CL error code: {1}", + clError.what(), clError.err())); + } + + // Note the first argument (path to cl source code) will be ignored as they should be embedded in the armcompute. + arm_compute::CLKernelLibrary::get().init(".", context, device); + arm_compute::CLScheduler::get().init(context, commandQueue, device, m_Tuner, m_HeuristicsHandle); +} + +void GpuFsaContextControl::ClearClCache() +{ + DoLoadOpenClRuntime(true); +} + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaContextControl.hpp b/src/backends/gpuFsa/GpuFsaContextControl.hpp new file mode 100644 index 0000000000..f77b1fbdd4 --- /dev/null +++ b/src/backends/gpuFsa/GpuFsaContextControl.hpp @@ -0,0 +1,42 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include + +namespace armnn +{ + +// ARM Compute OpenCL context control. +class GpuFsaContextControl +{ +public: + + GpuFsaContextControl(arm_compute::CLTuner* = nullptr, + arm_compute::CLGEMMHeuristicsHandle* = nullptr, + bool profilingEnabled = false); + + virtual ~GpuFsaContextControl(); + + void LoadOpenClRuntime(); + + // Users should call this (after freeing all of the cl::Context objects they use) + // to release the cached memory used by the compute library. + void UnloadOpenClRuntime(); + + // Clear the CL cache, without losing the tuned parameter settings. + void ClearClCache(); + +private: + + void DoLoadOpenClRuntime(bool updateTunedParameters); + + arm_compute::CLTuner* m_Tuner; + arm_compute::CLGEMMHeuristicsHandle* m_HeuristicsHandle; + + bool m_ProfilingEnabled; +}; + +} // namespace armnn diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp b/src/backends/gpuFsa/GpuFsaMemoryManager.cpp deleted file mode 100644 index 4eefb87d88..0000000000 --- a/src/backends/gpuFsa/GpuFsaMemoryManager.cpp +++ /dev/null @@ -1,101 +0,0 @@ -// -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#include "GpuFsaMemoryManager.hpp" - -#include - -#include - -namespace armnn -{ - -GpuFsaMemoryManager::GpuFsaMemoryManager() -{} - -GpuFsaMemoryManager::~GpuFsaMemoryManager() -{} - -GpuFsaMemoryManager::Pool* GpuFsaMemoryManager::Manage(unsigned int numBytes) -{ - if (!m_FreePools.empty()) - { - Pool* res = m_FreePools.back(); - m_FreePools.pop_back(); - res->Reserve(numBytes); - return res; - } - else - { - m_Pools.push_front(Pool(numBytes)); - return &m_Pools.front(); - } -} - -void GpuFsaMemoryManager::Allocate(GpuFsaMemoryManager::Pool* pool) -{ - ARMNN_ASSERT(pool); - m_FreePools.push_back(pool); -} - -void* GpuFsaMemoryManager::GetPointer(GpuFsaMemoryManager::Pool* pool) -{ - return pool->GetPointer(); -} - -void GpuFsaMemoryManager::Acquire() -{ - for (Pool &pool: m_Pools) - { - pool.Acquire(); - } -} - -void GpuFsaMemoryManager::Release() -{ - for (Pool &pool: m_Pools) - { - pool.Release(); - } -} - -GpuFsaMemoryManager::Pool::Pool(unsigned int numBytes) - : m_Size(numBytes), - m_Pointer(nullptr) -{} - -GpuFsaMemoryManager::Pool::~Pool() -{ - if (m_Pointer) - { - Release(); - } -} - -void* GpuFsaMemoryManager::Pool::GetPointer() -{ - ARMNN_ASSERT_MSG(m_Pointer, "GpuFsaMemoryManager::Pool::GetPointer() called when memory not acquired"); - return m_Pointer; -} - -void GpuFsaMemoryManager::Pool::Reserve(unsigned int numBytes) -{ - ARMNN_ASSERT_MSG(!m_Pointer, "GpuFsaMemoryManager::Pool::Reserve() cannot be called after memory acquired"); - m_Size = std::max(m_Size, numBytes); -} - -void GpuFsaMemoryManager::Pool::Acquire() -{ - ARMNN_ASSERT_MSG(!m_Pointer, "GpuFsaMemoryManager::Pool::Acquire() called when memory already acquired"); - m_Pointer = ::operator new(size_t(m_Size)); -} - -void GpuFsaMemoryManager::Pool::Release() -{ - ARMNN_ASSERT_MSG(m_Pointer, "GpuFsaMemoryManager::Pool::Release() called when memory not acquired"); - ::operator delete(m_Pointer); - m_Pointer = nullptr; -} - -} \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp b/src/backends/gpuFsa/GpuFsaMemoryManager.hpp deleted file mode 100644 index 636b839a51..0000000000 --- a/src/backends/gpuFsa/GpuFsaMemoryManager.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#pragma once - -#include - -#include -#include - -namespace armnn -{ - -// A dummy MemoryManager which will be deleted once the GpuFsa Backend is integrated with ClMemoryManager -class GpuFsaMemoryManager : public IMemoryManager -{ -public: - GpuFsaMemoryManager(); - virtual ~GpuFsaMemoryManager(); - - class Pool; - - Pool* Manage(unsigned int numBytes); - - void Allocate(Pool *pool); - - void* GetPointer(Pool *pool); - - void Acquire() override; - void Release() override; - - class Pool - { - public: - Pool(unsigned int numBytes); - ~Pool(); - - void Acquire(); - void Release(); - - void* GetPointer(); - - void Reserve(unsigned int numBytes); - - private: - unsigned int m_Size; - void* m_Pointer; - }; - -private: - GpuFsaMemoryManager(const GpuFsaMemoryManager&) = delete; // Noncopyable - GpuFsaMemoryManager& operator=(const GpuFsaMemoryManager&) = delete; // Noncopyable - - std::forward_list m_Pools; - std::vector m_FreePools; -}; - -} diff --git a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp index 875b7d7112..9efb300576 100644 --- a/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp +++ b/src/backends/gpuFsa/GpuFsaRegistryInitializer.cpp @@ -1,9 +1,11 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // + #include "GpuFsaBackend.hpp" #include + namespace { using namespace armnn; diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp b/src/backends/gpuFsa/GpuFsaTensorHandle.cpp deleted file mode 100644 index e806be49bb..0000000000 --- a/src/backends/gpuFsa/GpuFsaTensorHandle.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#include "GpuFsaTensorHandle.hpp" - -namespace armnn -{ -GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo, - std::shared_ptr& memoryManager) - : m_TensorInfo(tensorInfo) - , m_MemoryManager(memoryManager) - , m_Pool(nullptr) - , m_UnmanagedMemory(nullptr) - , m_ImportFlags(static_cast(MemorySource::Undefined)) - , m_Imported(false) - , m_IsImportEnabled(false) -{} - -GpuFsaTensorHandle::GpuFsaTensorHandle(const TensorInfo& tensorInfo, - MemorySourceFlags importFlags) - : m_TensorInfo(tensorInfo) - , m_Pool(nullptr) - , m_UnmanagedMemory(nullptr) - , m_ImportFlags(importFlags) - , m_Imported(false) - , m_IsImportEnabled(true) -{} - -GpuFsaTensorHandle::~GpuFsaTensorHandle() -{ - if (!m_Pool) - { - // unmanaged - if (!m_Imported) - { - ::operator delete(m_UnmanagedMemory); - } - } -} - -void GpuFsaTensorHandle::Manage() -{ - if (!m_IsImportEnabled) - { - ARMNN_ASSERT_MSG(!m_Pool, "GpuFsaTensorHandle::Manage() called twice"); - ARMNN_ASSERT_MSG(!m_UnmanagedMemory, "GpuFsaTensorHandle::Manage() called after Allocate()"); - - m_Pool = m_MemoryManager->Manage(m_TensorInfo.GetNumBytes()); - } -} - -void GpuFsaTensorHandle::Allocate() -{ - // If import is enabled, do not allocate the tensor - if (!m_IsImportEnabled) - { - - if (!m_UnmanagedMemory) - { - if (!m_Pool) - { - // unmanaged - m_UnmanagedMemory = ::operator new(m_TensorInfo.GetNumBytes()); - } - else - { - m_MemoryManager->Allocate(m_Pool); - } - } - else - { - throw InvalidArgumentException("GpuFsaTensorHandle::Allocate Trying to allocate a GpuFsaTensorHandle" - "that already has allocated memory."); - } - } -} - -const void* GpuFsaTensorHandle::Map(bool /*unused*/) const -{ - return GetPointer(); -} - -void* GpuFsaTensorHandle::GetPointer() const -{ - if (m_UnmanagedMemory) - { - return m_UnmanagedMemory; - } - else if (m_Pool) - { - return m_MemoryManager->GetPointer(m_Pool); - } - else - { - throw NullPointerException("GpuFsaTensorHandle::GetPointer called on unmanaged, unallocated tensor handle"); - } -} - -void GpuFsaTensorHandle::CopyOutTo(void* dest) const -{ - const void *src = GetPointer(); - ARMNN_ASSERT(src); - memcpy(dest, src, m_TensorInfo.GetNumBytes()); -} - -void GpuFsaTensorHandle::CopyInFrom(const void* src) -{ - void *dest = GetPointer(); - ARMNN_ASSERT(dest); - memcpy(dest, src, m_TensorInfo.GetNumBytes()); -} - -bool GpuFsaTensorHandle::Import(void* memory, MemorySource source) -{ - if (m_ImportFlags & static_cast(source)) - { - if (m_IsImportEnabled && source == MemorySource::Malloc) - { - // Check memory alignment - if(!CanBeImported(memory, source)) - { - if (m_Imported) - { - m_Imported = false; - m_UnmanagedMemory = nullptr; - } - return false; - } - - // m_UnmanagedMemory not yet allocated. - if (!m_Imported && !m_UnmanagedMemory) - { - m_UnmanagedMemory = memory; - m_Imported = true; - return true; - } - - // m_UnmanagedMemory initially allocated with Allocate(). - if (!m_Imported && m_UnmanagedMemory) - { - return false; - } - - // m_UnmanagedMemory previously imported. 
- if (m_Imported) - { - m_UnmanagedMemory = memory; - return true; - } - } - } - - return false; -} - -bool GpuFsaTensorHandle::CanBeImported(void* memory, MemorySource source) -{ - if (m_ImportFlags & static_cast(source)) - { - if (m_IsImportEnabled && source == MemorySource::Malloc) - { - uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType()); - if (reinterpret_cast(memory) % alignment) - { - return false; - } - return true; - } - } - return false; -} - - - -} \ No newline at end of file diff --git a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp index b2da50a467..d6901d1225 100644 --- a/src/backends/gpuFsa/GpuFsaTensorHandle.hpp +++ b/src/backends/gpuFsa/GpuFsaTensorHandle.hpp @@ -1,83 +1,361 @@ // -// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once -#include +#include +#include -#include "GpuFsaMemoryManager.hpp" +#include +#include + +#include +#include +#include +#include +#include +#include + +#include namespace armnn { -// An implementation of ITensorHandle with simple "bump the pointer" memory-management behaviour -// Will be refactored to look more like ClTensorHandle.hpp and use ClMemoryManager instead of GpuFsaMemoryManager -class GpuFsaTensorHandle : public ITensorHandle +class GpuFsaTensorHandle : public IClTensorHandle { public: - GpuFsaTensorHandle(const TensorInfo& tensorInfo, std::shared_ptr& memoryManager); + GpuFsaTensorHandle(const TensorInfo& tensorInfo) + : m_ImportFlags(static_cast(MemorySource::Undefined)), + m_Imported(false), + m_IsImportEnabled(false) + { + armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo); + } - GpuFsaTensorHandle(const TensorInfo& tensorInfo, MemorySourceFlags importFlags); + GpuFsaTensorHandle(const TensorInfo& tensorInfo, + DataLayout dataLayout, + MemorySourceFlags importFlags = static_cast(MemorySource::Undefined)) + : m_ImportFlags(importFlags), + m_Imported(false), + m_IsImportEnabled(false) + { + armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout); + } - ~GpuFsaTensorHandle(); + arm_compute::CLTensor& GetTensor() override { return m_Tensor; } + arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; } + virtual void Allocate() override + { + // If we have enabled Importing, don't allocate the tensor + if (m_IsImportEnabled) + { + throw MemoryImportException("GpuFsaTensorHandle::Attempting to allocate memory when importing"); + } + else + { + armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor); + } - virtual void Manage() override; + } - virtual void Allocate() override; + virtual void Manage() override + { + // If we have enabled Importing, don't manage the tensor + if (m_IsImportEnabled) + { + throw MemoryImportException("GpuFsaTensorHandle::Attempting to manage memory when importing"); + } + else + { + assert(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } + } - virtual ITensorHandle* GetParent() const override + virtual const void* Map(bool blocking = true) const override { - return nullptr; + const_cast(&m_Tensor)->map(blocking); + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); } - virtual const void* Map(bool /* blocking = true */) const override; - using ITensorHandle::Map; + virtual void Unmap() const override { const_cast(&m_Tensor)->unmap(); } + + virtual ITensorHandle* GetParent() 
const override { return nullptr; } + + virtual arm_compute::DataType GetDataType() const override + { + return m_Tensor.info()->data_type(); + } - virtual void Unmap() const override - {} + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) override + { + m_MemoryGroup = PolymorphicPointerDowncast(memoryGroup); + } TensorShape GetStrides() const override { - return GetUnpaddedTensorStrides(m_TensorInfo); + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); } TensorShape GetShape() const override { - return m_TensorInfo.GetShape(); + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); } - const TensorInfo& GetTensorInfo() const + void SetImportFlags(MemorySourceFlags importFlags) { - return m_TensorInfo; + m_ImportFlags = importFlags; } - virtual MemorySourceFlags GetImportFlags() const override + MemorySourceFlags GetImportFlags() const override { return m_ImportFlags; } - virtual bool Import(void* memory, MemorySource source) override; - virtual bool CanBeImported(void* memory, MemorySource source) override; + void SetImportEnabledFlag(bool importEnabledFlag) + { + m_IsImportEnabled = importEnabledFlag; + } -private: - // Only used for testing - void CopyOutTo(void*) const override; - void CopyInFrom(const void*) override; + virtual bool Import(void* /*memory*/, MemorySource source) override + { + if (m_ImportFlags & static_cast(source)) + { + throw MemoryImportException("GpuFsaTensorHandle::Incorrect import flag"); + } + m_Imported = false; + return false; + } - void* GetPointer() const; + virtual bool CanBeImported(void* /*memory*/, MemorySource /*source*/) override + { + // This TensorHandle can never import. + return false; + } - GpuFsaTensorHandle(const GpuFsaTensorHandle& other) = delete; // noncopyable - GpuFsaTensorHandle& operator=(const GpuFsaTensorHandle& other) = delete; //noncopyable +private: + // Only used for testing + void CopyOutTo(void* memory) const override + { + const_cast(this)->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::QSYMM8: + case arm_compute::DataType::QSYMM8_PER_CHANNEL: + case arm_compute::DataType::QASYMM8_SIGNED: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::F16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S16: + case arm_compute::DataType::QSYMM16: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + case arm_compute::DataType::S32: + armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(), + static_cast(memory)); + break; + default: + { + throw armnn::UnimplementedException(); + } + } + const_cast(this)->Unmap(); + } - TensorInfo m_TensorInfo; + // Only used for testing + void CopyInFrom(const void* memory) override + { + this->Map(true); + switch(this->GetDataType()) + { + case arm_compute::DataType::F32: + armcomputetensorutils::CopyArmComputeITensorData(static_cast(memory), + this->GetTensor()); + break; + case arm_compute::DataType::U8: + case arm_compute::DataType::QASYMM8: + 
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        this->Unmap();
+    }
 
-    std::shared_ptr<GpuFsaMemoryManager> m_MemoryManager;
-    GpuFsaMemoryManager::Pool* m_Pool;
-    mutable void* m_UnmanagedMemory;
+    arm_compute::CLTensor m_Tensor;
+    std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup;
     MemorySourceFlags m_ImportFlags;
     bool m_Imported;
     bool m_IsImportEnabled;
 };
 
-}
\ No newline at end of file
+class GpuFsaSubTensorHandle : public IClTensorHandle
+{
+public:
+    GpuFsaSubTensorHandle(IClTensorHandle* parent,
+                          const arm_compute::TensorShape& shape,
+                          const arm_compute::Coordinates& coords)
+        : m_Tensor(&parent->GetTensor(), shape, coords)
+    {
+        parentHandle = parent;
+    }
+
+    arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; }
+    arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; }
+
+    virtual void Allocate() override {}
+    virtual void Manage() override {}
+
+    virtual const void* Map(bool blocking = true) const override
+    {
+        const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->map(blocking);
+        return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+    }
+    virtual void Unmap() const override { const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->unmap(); }
+
+    virtual ITensorHandle* GetParent() const override { return parentHandle; }
+
+    virtual arm_compute::DataType GetDataType() const override
+    {
+        return m_Tensor.info()->data_type();
+    }
+
+    virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+    TensorShape GetStrides() const override
+    {
+        return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+    }
+
+    TensorShape GetShape() const override
+    {
+        return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+    }
+
+private:
+    // Only used for testing
+    void CopyOutTo(void* memory) const override
+    {
+        const_cast<GpuFsaSubTensorHandle*>(this)->Map(true);
+        switch(this->GetDataType())
+        {
+            case arm_compute::DataType::F32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<float*>(memory));
+                break;
+            case arm_compute::DataType::U8:
+            case arm_compute::DataType::QASYMM8:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<uint8_t*>(memory));
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<armnn::Half*>(memory));
+                break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int8_t*>(memory));
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int16_t*>(memory));
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<int32_t*>(memory));
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        const_cast<GpuFsaSubTensorHandle*>(this)->Unmap();
+    }
+
+    // Only used for testing
+    void CopyInFrom(const void* memory) override
+    {
+        this->Map(true);
+        switch(this->GetDataType())
+        {
+            case arm_compute::DataType::F32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const float*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::U8:
+            case arm_compute::DataType::QASYMM8:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::F16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::QSYMM8:
+            case arm_compute::DataType::QSYMM8_PER_CHANNEL:
+            case arm_compute::DataType::QASYMM8_SIGNED:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int8_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S16:
+            case arm_compute::DataType::QSYMM16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int16_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            case arm_compute::DataType::S32:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const int32_t*>(memory),
+                                                                 this->GetTensor());
+                break;
+            default:
+            {
+                throw armnn::UnimplementedException();
+            }
+        }
+        this->Unmap();
+    }
+
+    mutable arm_compute::CLSubTensor m_Tensor;
+    ITensorHandle* parentHandle = nullptr;
+};
+
+} // namespace armnn
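The handle above maps the underlying CLTensor for every host-side access and unmaps it afterwards. A minimal usage sketch of that contract follows; ReadOutput is a hypothetical helper, not part of this patch, and only the existing ITensorHandle::Map/Unmap interface is assumed:

    #include <armnn/backends/ITensorHandle.hpp>
    #include <cstring>

    // Copy numBytes out of a CL-backed handle into host memory.
    void ReadOutput(const armnn::ITensorHandle& handle, void* dst, size_t numBytes)
    {
        const void* src = handle.Map(true); // blocks until the CL buffer is host-visible
        std::memcpy(dst, src, numBytes);
        handle.Unmap();                     // required before the GPU touches the tensor again
    }

Each Map must be paired with an Unmap, which is why the CopyOutTo/CopyInFrom test helpers above bracket their switch statements with exactly this pattern.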
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
index cd9d8cd64d..c1a34d24e5 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.cpp
@@ -1,32 +1,50 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "GpuFsaTensorHandle.hpp"
 #include "GpuFsaTensorHandleFactory.hpp"
 
-#include "armnn/Logging.hpp"
-#include
-
 namespace armnn
 {
 
 using FactoryId = ITensorHandleFactory::FactoryId;
 
-const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
-{
-    static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
-    return s_Id;
-}
-
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateSubTensorHandle(ITensorHandle& parent,
-                                                                                const TensorShape& subTensorShape,
-                                                                                const unsigned int* subTensorOrigin)
-    const
+                                                                                const TensorShape& subTensorShape,
+                                                                                const unsigned int* subTensorOrigin) const
 {
-    IgnoreUnused(parent, subTensorShape, subTensorOrigin);
-    return nullptr;
+    arm_compute::Coordinates coords;
+    arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
+
+    coords.set_num_dimensions(subTensorShape.GetNumDimensions());
+    for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); ++i)
+    {
+        // Arm compute indexes tensor coords in reverse order.
+        unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
+        coords.set(i, armnn::numeric_cast<int>(subTensorOrigin[revertedIndex]));
+    }
+
+    const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.GetShape());
+
+    // In order for ACL to support subtensors the concat axis cannot be on x or y and the values of x and y
+    // must match the parent shapes
+    if (coords.x() != 0 || coords.y() != 0)
+    {
+        return nullptr;
+    }
+    if ((parentShape.x() != shape.x()) || (parentShape.y() != shape.y()))
+    {
+        return nullptr;
+    }
+
+    if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape))
+    {
+        return nullptr;
+    }
+
+    return std::make_unique<GpuFsaSubTensorHandle>(PolymorphicDowncast<IClTensorHandle*>(&parent), shape, coords);
 }
 
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
@@ -43,25 +61,32 @@ std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(con
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                              const bool IsMemoryManaged) const
 {
-    std::unique_ptr<GpuFsaTensorHandle> handle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
     if (!IsMemoryManaged)
     {
         ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed.";
     }
-    return handle;
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+    return tensorHandle;
 }
 
 std::unique_ptr<ITensorHandle> GpuFsaTensorHandleFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                              DataLayout dataLayout,
                                                                              const bool IsMemoryManaged) const
 {
-    IgnoreUnused(dataLayout);
-    std::unique_ptr<GpuFsaTensorHandle> handle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
     if (!IsMemoryManaged)
     {
         ARMNN_LOG(warning) << "GpuFsaTensorHandleFactory only has support for memory managed.";
     }
-    return handle;
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+    return tensorHandle;
+}
+
+const FactoryId& GpuFsaTensorHandleFactory::GetIdStatic()
+{
+    static const FactoryId s_Id(GpuFsaTensorHandleFactoryId());
+    return s_Id;
 }
 
 const FactoryId& GpuFsaTensorHandleFactory::GetId() const
@@ -71,7 +96,7 @@ const FactoryId& GpuFsaTensorHandleFactory::GetId() const
 
 bool GpuFsaTensorHandleFactory::SupportsSubTensors() const
 {
-    return false;
+    return true;
 }
 
 MemorySourceFlags GpuFsaTensorHandleFactory::GetExportFlags() const
diff --git a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
index 9f88de598b..93a44259f6 100644
--- a/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaTensorHandleFactory.hpp
@@ -1,14 +1,13 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
-
 #pragma once
 
-#include "GpuFsaMemoryManager.hpp"
-
 #include
+#include
+
 namespace armnn
 {
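CreateSubTensorHandle above reverses the sub-tensor origin because ACL's Coordinates index the innermost dimension first, while ArmNN's TensorShape is ordered outermost-first. A standalone sketch of just that index reversal; the dimension count and origin are made-up values, not from the patch:

    #include <cstdio>

    int main()
    {
        const unsigned int numDims = 4;
        // Origin of the sub-tensor in ArmNN dimension order, e.g. N,C,H,W.
        const unsigned int subTensorOrigin[numDims] = {0, 2, 0, 0};
        int aclCoords[numDims];
        for (unsigned int i = 0; i < numDims; ++i)
        {
            // Same arithmetic as the factory loop: ACL index i maps to
            // ArmNN dimension (numDims - i - 1).
            aclCoords[i] = static_cast<int>(subTensorOrigin[numDims - i - 1]);
        }
        std::printf("%d %d %d %d\n", aclCoords[0], aclCoords[1], aclCoords[2], aclCoords[3]); // prints: 0 0 2 0
        return 0;
    }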
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
index 687c8c0ac8..6d13879f51 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.cpp
@@ -1,10 +1,10 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+
 #include
-#include
-#include
+
 #include "GpuFsaWorkloadFactory.hpp"
 #include "GpuFsaBackendId.hpp"
 #include "GpuFsaTensorHandle.hpp"
@@ -17,11 +17,9 @@ namespace
 {
 static const BackendId s_Id{GpuFsaBackendId()};
 }
 
 template <typename QueueDescriptorType>
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& descriptor,
-                                                               const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::MakeWorkload(const QueueDescriptorType& /*descriptor*/,
+                                                               const WorkloadInfo& /*info*/) const
 {
-    IgnoreUnused(descriptor);
-    IgnoreUnused(info);
     return nullptr;
 }
 
@@ -64,51 +62,29 @@ bool GpuFsaWorkloadFactory::IsLayerSupported(const Layer& layer,
     return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported);
 }
 
-bool GpuFsaWorkloadFactory::IsLayerSupported(const IConnectableLayer& layer,
-                                             Optional<DataType> dataType,
-                                             std::string& outReasonIfUnsupported,
-                                             const ModelOptions& modelOptions)
-{
-    return IWorkloadFactory::IsLayerSupported(s_Id, layer, dataType, outReasonIfUnsupported, modelOptions);
-}
-
 std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
-                                                                         const bool isMemoryManaged) const
+                                                                         const bool /*isMemoryManaged*/) const
 {
-    if (isMemoryManaged)
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
-    }
-    else
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, static_cast<MemorySourceFlags>(MemorySource::Malloc));
-    }
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo);
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+
+    return tensorHandle;
 }
 
 std::unique_ptr<ITensorHandle> GpuFsaWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                          DataLayout dataLayout,
-                                                                         const bool isMemoryManaged) const
+                                                                         const bool /*isMemoryManaged*/) const
 {
-    IgnoreUnused(dataLayout);
+    std::unique_ptr<GpuFsaTensorHandle> tensorHandle = std::make_unique<GpuFsaTensorHandle>(tensorInfo, dataLayout);
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
 
-    if (isMemoryManaged)
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, m_MemoryManager);
-    }
-    else
-    {
-        return std::make_unique<GpuFsaTensorHandle>(tensorInfo, static_cast<MemorySourceFlags>(MemorySource::Malloc));
-    }
+    return tensorHandle;
 }
 
-std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType type,
-                                                                 const QueueDescriptor& descriptor,
-                                                                 const WorkloadInfo& info) const
+std::unique_ptr<IWorkload> GpuFsaWorkloadFactory::CreateWorkload(LayerType /*type*/,
+                                                                 const QueueDescriptor& /*descriptor*/,
+                                                                 const WorkloadInfo& /*info*/) const
 {
-    IgnoreUnused(type);
-    IgnoreUnused(descriptor);
-    IgnoreUnused(info);
-
     return nullptr;
 }
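Handles created by this factory no longer own host memory: they are attached to the memory manager's inter-layer memory group, so Manage() records the tensor with ACL's lifetime planner and Allocate() commits the backing CL buffer. A rough sketch of that sequence as a caller might drive it; this is hypothetical driver code, not from the patch:

    #include <armnn/backends/ITensorHandle.hpp>

    void PrepareManagedHandle(armnn::ITensorHandle& handle)
    {
        handle.Manage();   // register the tensor with its memory group for planning
        handle.Allocate(); // commit the allocation once planning is complete
    }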
diff --git a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
index 0d80f0363c..9b97070766 100644
--- a/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
+++ b/src/backends/gpuFsa/GpuFsaWorkloadFactory.hpp
@@ -1,14 +1,12 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #pragma once
 
-#include "GpuFsaMemoryManager.hpp"
+#include
 
 #include
-#include
-#include
 
 namespace armnn
 {
@@ -28,19 +26,13 @@ public:
                                  Optional<DataType> dataType,
                                  std::string& outReasonIfUnsupported);
 
-    static bool IsLayerSupported(const IConnectableLayer& layer,
-                                 Optional<DataType> dataType,
-                                 std::string& outReasonIfUnsupported,
-                                 const ModelOptions& modelOptions);
-
     bool SupportsSubTensors() const override { return false; }
 
     ARMNN_DEPRECATED_MSG("Use ITensorHandleFactory::CreateSubTensorHandle instead")
-    std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& parent,
-                                                         TensorShape const& subTensorShape,
-                                                         unsigned int const* subTensorOrigin) const override
+    std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& /*parent*/,
+                                                         TensorShape const& /*subTensorShape*/,
+                                                         unsigned int const* /*subTensorOrigin*/) const override
     {
-        IgnoreUnused(parent, subTensorShape, subTensorOrigin);
         return nullptr;
     }
diff --git a/src/backends/gpuFsa/backend.cmake b/src/backends/gpuFsa/backend.cmake
index 589af19c22..2f4f5fbc7b 100644
--- a/src/backends/gpuFsa/backend.cmake
+++ b/src/backends/gpuFsa/backend.cmake
@@ -1,12 +1,12 @@
 #
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 #
 
 add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/gpuFsa)
 list(APPEND armnnLibraries armnnGpuFsaBackend)
 
-if(ARMNNGPUFSA)
+if(ARMCOMPUTEGPUFSA)
     list(APPEND armnnLibraries armnnGpuFsaBackendWorkloads)
     list(APPEND armnnUnitTestLibraries armnnGpuFsaBackendUnitTests)
 else()
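The option rename also renames the preprocessor symbol, so shared code can guard GpuFsa-only paths the same way ARMCOMPUTECL_ENABLED guards OpenCL paths. A hypothetical guard, illustrative only:

    #if defined(ARMCOMPUTEGPUFSA_ENABLED)
    // Compile GpuFsa-specific code, e.g. registering the GpuFsa tensor handle factory.
    #endif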
diff --git a/src/backends/gpuFsa/backend.mk b/src/backends/gpuFsa/backend.mk
index 840e10338c..78ba7ba167 100644
--- a/src/backends/gpuFsa/backend.mk
+++ b/src/backends/gpuFsa/backend.mk
@@ -1,5 +1,5 @@
 #
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 #
 
@@ -8,23 +8,23 @@
 # file in the root of ArmNN
 
 # The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_GPU_FSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_GPU_FSA_ENABLED),1)
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
 
-# ARMNN_GPU_FSA_ENABLED == 1
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
 # Include the source files for the GPU Dynamic Fusion backend
 
 BACKEND_SOURCES := \
         GpuFsaBackend.cpp \
+        GpuFsaBackendContext.cpp \
+        GpuFsaContextControl.cpp \
         GpuFsaLayerSupport.cpp \
-        GpuFsaMemoryManager.cpp \
         GpuFsaRegistryInitializer.cpp \
-        GpuFsaTensorHandle.cpp \
        GpuFsaTensorHandleFactory.cpp \
         GpuFsaWorkloadFactory.cpp
 else
 
-# ARMNN_GPU_FSA_ENABLED == 0
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
 # No source file will be compiled for the GPU Dynamic Fusion backend
 
 BACKEND_SOURCES :=
@@ -36,10 +36,10 @@ endif
 # up by the Android.mk file in the root of ArmNN
 
 # The variable to enable/disable the GPU Dynamic Fusion backend
-# (ARMNN_GPU_FSA_ENABLED is declared in android-nn-driver/Android.mk)
-ifeq ($(ARMNN_GPU_FSA_ENABLED),1)
+# (ARMNN_COMPUTE_GPUFSA_ENABLED is declared in android-nn-driver/Android.mk)
+ifeq ($(ARMNN_COMPUTE_GPUFSA_ENABLED),1)
 
-# ARMNN_GPU_FSA_ENABLED == 1
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 1
 # Include the source files for the GPU Dynamic Fusion backend tests
 
 BACKEND_TEST_SOURCES := \
@@ -49,7 +49,7 @@ BACKEND_TEST_SOURCES := \
         test/GpuFsaOptimizedNetworkTests.cpp
 else
 
-# ARMNN_GPU_FSA_ENABLED == 0
+# ARMNN_COMPUTE_GPUFSA_ENABLED == 0
 # No source file will be compiled for the GPU Dynamic Fusion backend tests
 
 BACKEND_TEST_SOURCES :=
diff --git a/src/backends/gpuFsa/test/CMakeLists.txt b/src/backends/gpuFsa/test/CMakeLists.txt
index c600589768..66091e90df 100644
--- a/src/backends/gpuFsa/test/CMakeLists.txt
+++ b/src/backends/gpuFsa/test/CMakeLists.txt
@@ -1,9 +1,10 @@
 #
-# Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 #
 
 list(APPEND armnnGpuFsaBackendUnitTests_sources
+    GpuFsaDefaultAllocatorTests.cpp
     GpuFsaEndToEndTests.cpp
     GpuFsaLayerTests.cpp
     GpuFsaLayerSupportTests.cpp
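The tests added below register an allocator for the "GpuFsa" backend id through IRuntime::CreationOptions::m_CustomAllocatorMap. For comparison, a minimal host-memory allocator sketch; HostAllocator is hypothetical, and it assumes the allocate/free/GetMemorySourceType signatures from armnn/backends/ICustomAllocator.hpp:

    #include <armnn/backends/ICustomAllocator.hpp>
    #include <cstddef>
    #include <cstdlib>

    class HostAllocator : public armnn::ICustomAllocator
    {
    public:
        void* allocate(size_t size, size_t alignment) override
        {
            // The tests below pass alignment == 0, meaning "use a sensible default".
            size_t align = (alignment == 0) ? alignof(std::max_align_t) : alignment;
            // std::aligned_alloc requires size to be a multiple of the alignment.
            size_t paddedSize = ((size + align - 1) / align) * align;
            return std::aligned_alloc(align, paddedSize);
        }

        void free(void* ptr) override { std::free(ptr); }

        armnn::MemorySource GetMemorySourceType() override { return armnn::MemorySource::Malloc; }
    };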
diff --git a/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
new file mode 100644
index 0000000000..1f603e2718
--- /dev/null
+++ b/src/backends/gpuFsa/test/GpuFsaDefaultAllocatorTests.cpp
@@ -0,0 +1,193 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+// Requires the OpenCl backend to be included (GpuFsa)
+#include
+#include
+#include
+#include
+#include
+
+using namespace armnn;
+
+namespace
+{
+
+TEST_SUITE("DefaultAllocatorTests")
+{
+
+TEST_CASE("DefaultAllocatorTest")
+{
+    float number = 3;
+
+    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    auto customAllocator = std::make_shared<DefaultAllocator>();
+    options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Creates structures for input & output
+    unsigned int numElements = inputTensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, number);
+    CHECK(inputPtr[0] == 3);
+
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMulti")
+{
+    float number = 3;
+
+    TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32);
+
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    auto customAllocator = std::make_shared<DefaultAllocator>();
+    options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}};
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Creates structures for input & output
+    unsigned int numElements = inputTensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+    void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0);
+
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, number);
+    CHECK(inputPtr[0] == 3);
+    CHECK(inputPtr[1] == 3);
+
+    auto* inputPtr2 = reinterpret_cast<float*>(alignedInputPtr2);
+    std::fill_n(inputPtr2, numElements, number);
+    CHECK(inputPtr2[0] == 3);
+    CHECK(inputPtr2[1] == 3);
+
+    // No overlap
+    CHECK(inputPtr[0] == 3);
+    CHECK(inputPtr[1] == 3);
+
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+TEST_CASE("DefaultAllocatorTestMock")
+{
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Initialize Mock Backend
+    MockBackendInitialiser initialiser;
+    auto factoryFun = BackendRegistryInstance().GetFactory(MockBackend().GetIdStatic());
+    ARMNN_ASSERT(factoryFun != nullptr);
+    auto backend = factoryFun();
+    auto defaultAllocator = backend->GetDefaultAllocator();
+
+    // GetMemorySourceType
+    CHECK(defaultAllocator->GetMemorySourceType() == MemorySource::Malloc);
+
+    size_t totalBytes = 1 * sizeof(float);
+    // Allocate
+    void* ptr = defaultAllocator->allocate(totalBytes, 0);
+
+    // GetMemoryRegionAtOffset
+    CHECK(defaultAllocator->GetMemoryRegionAtOffset(ptr, 0, 0));
+
+    // Free
+    defaultAllocator->free(ptr);
+
+    // Clean up
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.Deregister(MockBackend().GetIdStatic());
+    backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic());
+}
+
+}
+
+
+TEST_SUITE("GpuFsaDefaultAllocatorTests")
+{
+
+TEST_CASE("GpuFsaDefaultAllocatorTest") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +TEST_CASE("GpuFsaDefaultAllocatorTestMulti") +{ + float number = 3; + + TensorInfo inputTensorInfo(TensorShape({2, 1}), DataType::Float32); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + auto customAllocator = std::make_shared(); + options.m_CustomAllocatorMap = {{"GpuFsa", std::move(customAllocator)}}; + IRuntimePtr run = IRuntime::Create(options); + + // Creates structures for input & output + unsigned int numElements = inputTensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + void* alignedInputPtr = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + void* alignedInputPtr2 = options.m_CustomAllocatorMap["GpuFsa"]->allocate(totalBytes, 0); + + auto* inputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(inputPtr, numElements, number); + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto* inputPtr2 = reinterpret_cast(alignedInputPtr2); + std::fill_n(inputPtr2, numElements, number); + CHECK(inputPtr2[0] == 3); + CHECK(inputPtr2[1] == 3); + + // No overlap + CHECK(inputPtr[0] == 3); + CHECK(inputPtr[1] == 3); + + auto& backendRegistry = armnn::BackendRegistryInstance(); + backendRegistry.DeregisterAllocator(GpuFsaBackend::GetIdStatic()); +} + +} + +} // namespace armnn \ No newline at end of file -- cgit v1.2.1