From 55a8ffda24fff5515803df10fb4863d46a1effdf Mon Sep 17 00:00:00 2001
From: Mike Kelly
Date: Wed, 7 Apr 2021 20:10:49 +0100
Subject: IVGCVSW-5823 Refactor Async Network API

* Moved IAsyncNetwork into IRuntime.
* All LoadedNetworks can be executed Asynchronously.

Signed-off-by: Mike Kelly
Change-Id: Ibbc901ab9110dc2f881425b75489bccf9ad54169
---
 Android.mk                                  |   1 -
 CMakeLists.txt                              |   3 -
 include/armnn/ArmNN.hpp                     |   2 +-
 include/armnn/IAsyncNetwork.hpp             |  64 --
 include/armnn/IRuntime.hpp                  |  34 +-
 include/armnn/IWorkingMemHandle.hpp         |   5 +
 include/armnn/NetworkFwd.hpp                |   7 -
 src/armnn/AsyncNetwork.cpp                  | 665 ---------------------
 src/armnn/AsyncNetwork.hpp                  | 106 ----
 src/armnn/LoadedNetwork.cpp                 | 362 ++++++++++-
 src/armnn/LoadedNetwork.hpp                 |  33 +-
 src/armnn/Runtime.cpp                       | 135 +++--
 src/armnn/Runtime.hpp                       |  20 +-
 src/armnn/WorkingMemHandle.cpp              |   4 +-
 src/armnn/WorkingMemHandle.hpp              |  12 +-
 .../test/StridedSliceAsyncEndToEndTest.hpp  |  12 +-
 16 files changed, 528 insertions(+), 937 deletions(-)
 delete mode 100644 include/armnn/IAsyncNetwork.hpp
 delete mode 100644 src/armnn/AsyncNetwork.cpp
 delete mode 100644 src/armnn/AsyncNetwork.hpp

diff --git a/Android.mk b/Android.mk
index 806d81bcd5..416c00238c 100644
--- a/Android.mk
+++ b/Android.mk
@@ -108,7 +108,6 @@ LOCAL_SRC_FILES := \
         profiling/server/src/timelineDecoder/TimelineCaptureCommandHandler.cpp \
         profiling/server/src/timelineDecoder/TimelineDecoder.cpp \
         profiling/server/src/timelineDecoder/TimelineDirectoryCaptureCommandHandler.cpp \
-        src/armnn/AsyncNetwork.cpp \
         src/armnn/BackendHelper.cpp \
         src/armnn/BackendRegistry.cpp \
         src/armnn/Descriptors.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62417bebb3..049a4f1e1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,7 +242,6 @@ list(APPEND armnn_sources
     include/armnn/Descriptors.hpp
     include/armnn/DescriptorsFwd.hpp
     include/armnn/Exceptions.hpp
-    include/armnn/IAsyncNetwork.hpp
     include/armnn/ILayerSupport.hpp
     include/armnn/ILayerVisitor.hpp
     include/armnn/INetwork.hpp
@@ -408,8 +407,6 @@ list(APPEND armnn_sources
     src/armnn/layers/TransposeLayer.cpp
     src/armnn/layers/UnmapLayer.cpp
     src/armnn/layers/UnmapLayer.hpp
-    src/armnn/AsyncNetwork.cpp
-    src/armnn/AsyncNetwork.hpp
     src/armnn/BackendRegistry.cpp
     src/armnn/BackendSettings.hpp
     src/armnn/BackendHelper.cpp
diff --git a/include/armnn/ArmNN.hpp b/include/armnn/ArmNN.hpp
index ac4d33f737..e4d5ce1fa1 100644
--- a/include/armnn/ArmNN.hpp
+++ b/include/armnn/ArmNN.hpp
@@ -7,9 +7,9 @@
 #include "BackendId.hpp"
 #include "Descriptors.hpp"
 #include "Exceptions.hpp"
-#include "IAsyncNetwork.hpp"
 #include "INetwork.hpp"
 #include "IRuntime.hpp"
+#include "IWorkingMemHandle.hpp"
 #include "LstmParams.hpp"
 #include "Optional.hpp"
 #include "QuantizedLstmParams.hpp"
diff --git a/include/armnn/IAsyncNetwork.hpp b/include/armnn/IAsyncNetwork.hpp
deleted file mode 100644
index c234ae55ac..0000000000
--- a/include/armnn/IAsyncNetwork.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//
-// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include
-
-#include "INetwork.hpp"
-#include "IProfiler.hpp"
-#include "IWorkingMemHandle.hpp"
-#include "Tensor.hpp"
-#include "Types.hpp"
-
-#include
-
-namespace armnn
-{
-struct INetworkProperties;
-
-namespace profiling
-{
-class ProfilingService;
-}
-
-namespace experimental
-{
-class AsyncNetworkImpl;
-
-class IAsyncNetwork
-{
-public:
-    IAsyncNetwork(std::unique_ptr net,
-                  const INetworkProperties& networkProperties,
-                  profiling::ProfilingService& profilingService);
-    ~IAsyncNetwork();
-
-    TensorInfo GetInputTensorInfo(LayerBindingId layerId) const;
-    TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const;
-
-    /// Thread safe execution of the network. Returns once execution is complete.
-    /// Will block until this and any other thread using the same workingMem object completes.
-    Status Execute(const InputTensors& inputTensors,
-                   const OutputTensors& outputTensors,
-                   IWorkingMemHandle& workingMemHandle);
-
-    /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
-    /// overlapped Execution by calling this function from different threads.
-    std::unique_ptr CreateWorkingMemHandle();
-
-    /// Get the profiler used for this network
-    std::shared_ptr GetProfiler() const;
-
-    /// Register a debug callback function to be used with this network
-    void RegisterDebugCallback(const DebugCallbackFunction& func);
-
-private:
-    std::unique_ptr pAsyncNetworkImpl;
-};
-
-} // end experimental namespace
-
-} // end armnn namespace
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index 9f7032914f..fc203e67e4 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -5,9 +5,9 @@
 #pragma once
 
 #include "BackendOptions.hpp"
-#include "IAsyncNetwork.hpp"
 #include "INetwork.hpp"
 #include "IProfiler.hpp"
+#include "IWorkingMemHandle.hpp"
 #include "Tensor.hpp"
 #include "Types.hpp"
 #include "TypesUtils.hpp"
@@ -28,12 +28,14 @@ using IRuntimePtr = std::unique_ptr;
 
 struct INetworkProperties
 {
-    INetworkProperties(bool importEnabled = false, bool exportEnabled = false)
+    INetworkProperties(bool importEnabled = false, bool exportEnabled = false, bool asyncEnabled = false)
         : m_ImportEnabled(importEnabled),
-          m_ExportEnabled(exportEnabled) {}
+          m_ExportEnabled(exportEnabled),
+          m_AsyncEnabled(asyncEnabled) {}
 
     const bool m_ImportEnabled;
     const bool m_ExportEnabled;
+    const bool m_AsyncEnabled;
 
     virtual ~INetworkProperties() {}
 };
@@ -145,20 +147,6 @@ public:
                        std::string& errorMessage,
                        const INetworkProperties& networkProperties);
 
-    /// This is an experimental function.
-    /// Creates an executable network. This network is thread safe allowing for multiple networks to be
-    /// loaded simultaneously via different threads.
-    /// Note that the network is never registered with the runtime so does not need to be 'Unloaded'.
-    /// @param [out] networkIdOut Unique identifier for the network is returned in this reference.
-    /// @param [in] network Complete network to load into the IRuntime.
-    /// @param [out] errorMessage Error message if there were any errors.
-    /// @param [out] networkProperties the INetworkProperties that govern how the network should operate.
-    /// @return The IAsyncNetwork
-    std::unique_ptr CreateAsyncNetwork(NetworkId& networkIdOut,
-                                       IOptimizedNetworkPtr network,
-                                       std::string& errorMessage,
-                                       const INetworkProperties& networkProperties);
-
     TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
     TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
 
@@ -167,6 +155,14 @@ public:
                            const InputTensors& inputTensors,
                            const OutputTensors& outputTensors);
 
+    /// This is an experimental function.
+    /// Evaluates a network using input in inputTensors and outputs filled into outputTensors.
+    /// This function performs a thread safe execution of the network. Returns once execution is complete.
+    /// Will block until this and any other thread using the same workingMem object completes.
+    Status Execute(IWorkingMemHandle& workingMemHandle,
+                   const InputTensors& inputTensors,
+                   const OutputTensors& outputTensors);
+
     /// Unloads a network from the IRuntime.
     /// At the moment this only removes the network from the m_Impl->m_Network.
     /// This might need more work in the future to be AndroidNN compliant.
@@ -176,6 +172,10 @@ public:
 
     const IDeviceSpec& GetDeviceSpec() const;
 
+    /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+    /// overlapped Execution by calling this function from different threads.
+    std::unique_ptr CreateWorkingMemHandle(NetworkId networkId);
+
     /// Gets the profiler corresponding to the given network id.
     /// @param networkId The id of the network for which to get the profile.
     /// @return A pointer to the requested profiler, or nullptr if not found.
diff --git a/include/armnn/IWorkingMemHandle.hpp b/include/armnn/IWorkingMemHandle.hpp
index 921b7e1f40..171fa3d81c 100644
--- a/include/armnn/IWorkingMemHandle.hpp
+++ b/include/armnn/IWorkingMemHandle.hpp
@@ -10,6 +10,8 @@
 namespace armnn
 {
 
+using NetworkId = int;
+
 namespace experimental
 {
 
@@ -20,6 +22,9 @@ class IWorkingMemHandle
 public:
     virtual ~IWorkingMemHandle() {};
 
+    /// Returns the NetworkId of the Network that this IWorkingMemHandle works with.
+    virtual NetworkId GetNetworkId() = 0;
+
     /// Allocate the backing memory required for execution. If this is not called, then allocation will be
     /// deferred to execution time. The mutex must be locked.
     virtual void Allocate() = 0;
diff --git a/include/armnn/NetworkFwd.hpp b/include/armnn/NetworkFwd.hpp
index 6c2970f28b..5db9ec4ebe 100644
--- a/include/armnn/NetworkFwd.hpp
+++ b/include/armnn/NetworkFwd.hpp
@@ -10,13 +10,6 @@ namespace armnn
 struct LstmInputParams;
 struct QuantizedLstmInputParams;
 
-namespace experimental
-{
-
-class IAsyncNetwork;
-
-} // end experimental namespace
-
 class INetwork;
 class IOptimizedNetwork;
 class Graph;
diff --git a/src/armnn/AsyncNetwork.cpp b/src/armnn/AsyncNetwork.cpp
deleted file mode 100644
index 230346a0c3..0000000000
--- a/src/armnn/AsyncNetwork.cpp
+++ /dev/null
@@ -1,665 +0,0 @@
-//
-// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT -// - -#include "AsyncNetwork.hpp" -#include "Graph.hpp" -#include "Layer.hpp" -#include "Profiling.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -namespace armnn -{ - -namespace experimental -{ - -IAsyncNetwork::IAsyncNetwork(std::unique_ptr net, - const INetworkProperties& networkProperties, - profiling::ProfilingService& profilingService) - : pAsyncNetworkImpl( new AsyncNetworkImpl(std::move(net), networkProperties, profilingService)) {}; - -IAsyncNetwork::~IAsyncNetwork() = default; - -TensorInfo IAsyncNetwork::GetInputTensorInfo(LayerBindingId layerId) const -{ - return pAsyncNetworkImpl->GetInputTensorInfo(layerId); -} - -TensorInfo IAsyncNetwork::GetOutputTensorInfo(LayerBindingId layerId) const -{ - return pAsyncNetworkImpl->GetOutputTensorInfo(layerId); -} - -Status IAsyncNetwork::Execute(const InputTensors& inputTensors, - const OutputTensors& outputTensors, - IWorkingMemHandle& workingMemHandle) -{ - return pAsyncNetworkImpl->Execute(inputTensors, outputTensors, workingMemHandle); -} - -std::unique_ptr IAsyncNetwork::CreateWorkingMemHandle() -{ - return pAsyncNetworkImpl->CreateWorkingMemHandle(); -} - -std::shared_ptr IAsyncNetwork::GetProfiler() const -{ - return pAsyncNetworkImpl->GetProfiler(); -} - -void IAsyncNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) -{ - pAsyncNetworkImpl->RegisterDebugCallback(func); -} - -void AddLayerStructure(std::unique_ptr& timelineUtils, - const Layer& layer, - profiling::ProfilingGuid networkGuid) -{ - // Add layer to the post-optimisation network structure - std::string layerName = layer.GetNameStr().empty() ? "" : layer.GetNameStr(); - timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(), - networkGuid, - layerName, - profiling::LabelsAndEventClasses::LAYER_GUID); - for (auto&& input : layer.GetInputSlots()) - { - const IOutputSlot* source = input.GetConnectedOutputSlot(); - ARMNN_ASSERT(source != NULL); - timelineUtils->CreateConnectionRelationship(profiling::ProfilingRelationshipType::RetentionLink, - source->GetOwningLayerGuid(), - layer.GetGuid()); - } -} - -void AddWorkloadStructure(std::unique_ptr& timelineUtils, - std::unique_ptr& workload, - const Layer& layer) -{ - // Add workload to the post-optimisation network structure - timelineUtils->CreateTypedEntity(workload->GetGuid(), profiling::LabelsAndEventClasses::WORKLOAD_GUID); - timelineUtils->MarkEntityWithLabel(workload->GetGuid(), - layer.GetBackendId().Get(), - profiling::LabelsAndEventClasses::BACKENDID_GUID); - - // Link the workload to the layer - timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, - layer.GetGuid(), - workload->GetGuid(), - profiling::LabelsAndEventClasses::CHILD_GUID); -} - -TensorInfo AsyncNetworkImpl::GetInputTensorInfo(LayerBindingId layerId) const -{ - for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers()) - { - ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot"); - if (inputLayer->GetBindingId() == layerId) - { - return inputLayer->GetOutputSlot(0).GetTensorInfo(); - } - } - - throw InvalidArgumentException(fmt::format("No input layer is associated with id {0}}", layerId)); -} - -TensorInfo AsyncNetworkImpl::GetOutputTensorInfo(LayerBindingId layerId) const -{ - for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers()) - { - 
ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot"); - ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected"); - if (outputLayer->GetBindingId() == layerId) - { - return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo(); - } - } - - throw InvalidArgumentException(fmt::format("No output layer is associated with id {0}}", layerId)); -} - -// Need something like the collectors to get the correct tensors for the inputs -void AsyncNetworkImpl::CollectInputTensorHandles( - std::unordered_map >& tensorHandles, - std::vector& inputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged) -{ - for (auto&& inputSlot : layer->GetInputSlots()) - { - // The graph must be well-formed at this point. - ARMNN_ASSERT(inputSlot.GetConnection()); - auto outputSlot = inputSlot.GetConnectedOutputSlot(); - auto key = outputSlot->GetOwningLayer().GetGuid(); - auto search = tensorHandles.find(key); - - if (search == tensorHandles.end()) - { - ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId(); - const TensorInfo& tensorInfo = outputSlot->GetTensorInfo(); - - ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); - ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); - ARMNN_ASSERT(handleFactory); - std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); - ITensorHandle* tensorPtr = tensor.release(); - inputs.push_back(tensorPtr); - } - else - { - unsigned int index = outputSlot->CalculateIndexOnOwner(); - inputs.push_back(search->second[index]); - } - } -} - -void AsyncNetworkImpl::CreateOutputTensorHandles( - std::unordered_map >& tensorHandles, - std::vector& outputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged) -{ - auto guid = layer->GetGuid(); - std::vector tensorHandleVectors; - tensorHandleVectors.reserve(layer->GetNumOutputSlots()); - - for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++) - { - const OutputSlot& slot = layer->GetOutputSlot(idx); - ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); - const TensorInfo& tensorInfo = slot.GetTensorInfo(); - - ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); - ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); - ARMNN_ASSERT(handleFactory); - std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); - ITensorHandle* tensorPtr = tensor.release(); - outputs.push_back(tensorPtr); - tensorHandleVectors.push_back(tensorPtr); - } - tensorHandles.insert({guid, tensorHandleVectors}); -} - -const IWorkloadFactory& AsyncNetworkImpl::GetWorkloadFactory(const Layer& layer) const -{ - const IWorkloadFactory* workloadFactory = nullptr; - - auto it = m_WorkloadFactories.find(layer.GetBackendId()); - if (it == m_WorkloadFactories.end()) - { - throw RuntimeException( - fmt::format("No workload factory for {0} to be used for layer: {1}}", - layer.GetBackendId().Get(), - layer.GetNameStr()), - CHECK_LOCATION()); - } - - workloadFactory = it->second.first.get(); - - ARMNN_ASSERT_MSG(workloadFactory, "No workload factory"); - - std::string reasonIfUnsupported; - ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, {}, reasonIfUnsupported), - "Factory does not support layer"); - IgnoreUnused(reasonIfUnsupported); - return *workloadFactory; -} - -void 
AsyncNetworkImpl::EnqueueInput(const BindableLayer& layer, - const ConstTensor& inputTensor, - WorkingMemHandle& context) -{ - if (layer.GetType() != LayerType::Input) - { - throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer"); - } - LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid(); - WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id); - ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output"); - - MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); - if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor - { - if (CheckFlag(importFlags, MemorySource::Malloc) ) - { - // This assumes a CPU Tensor handle - std::unique_ptr tensorHandle = - std::make_unique(inputTensor.GetInfo(), - inputTensor.GetMemoryArea()); - - void* mem = tensorHandle->Map(false); - if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc)) - { - tensorHandle->Unmap(); - return; - } - tensorHandle->Unmap(); - throw MemoryImportException("EnqueueInput: Memory Import failed"); - } - else - { - throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import"); - } - } - else - { - std::unique_ptr tensorHandle = - std::make_unique(inputTensor.GetInfo(), inputTensor.GetMemoryArea()); - - auto copyFunc = [](void* dst, const void* src, size_t size) - { - memcpy(dst, src, size); - }; - - for (const auto& input : descriptor.m_Inputs) - { - CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc); - } - } -} - -void AsyncNetworkImpl::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle) -{ - if (layer.GetType() != LayerType::Output) - { - throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer"); - } - ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); - - LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid(); - WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id); - - ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0]; - ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated."); - - // Try import the output tensor. - // Note: We can only import the output pointer if all of the following hold true: - // a) The imported pointer is aligned sufficiently - // b) The tensor has zero padding - // c) There is only one connection to the OutputSlot and it is to an OutputLayer. - // d) The output pointer is allocated via malloc. 
(Other types will be supported in a later release) - // e) m_IsExportEnabled must be set to true - if (m_NetworkProperties.m_ExportEnabled && - (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) - { - if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) - { - MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); - if (CheckFlag(importFlags, MemorySource::Malloc)) - { - std::unique_ptr tensorHandle = - std::make_unique(outputTensor.GetInfo(), - outputTensor.GetMemoryArea()); - - void* mem = tensorHandle->Map(false); - bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); - tensorHandle->Unmap(); - - if (importOk) - { - ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute"); - descriptor.m_Inputs[0]->Map(true); - descriptor.m_Inputs[0]->Unmap(); - } - else - { - throw MemoryExportException("EnqueueOutput: Memory Export failed"); - } - } - else - { - throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export"); - } - } - else - { - throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer"); - } - } - else - { - auto copyFunc = [](void* dst, const void* src, size_t size) - { - memcpy(dst, src, size); - }; - - std::unique_ptr tensorHandle = - std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); - - CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc); - } -} - -AsyncNetworkImpl::AsyncNetworkImpl(std::unique_ptr net, - const INetworkProperties& networkProperties, - profiling::ProfilingService& profilingService) : - m_OptimizedNetwork(std::move(net)), - m_NetworkProperties(networkProperties), - m_ProfilingService(profilingService) -{ - // Create a profiler and register it for the current thread. - m_Profiler = std::make_shared(); - ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get()); - - Graph &order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); - - //First create tensor handlers, backends and workload factories. - //Handlers are created before workloads are. - //Because workload creation can modify some of the handlers, - //(for example the splitter and concat layers). 
- for (auto &&layer : order) - { - auto const &backendId = layer->GetBackendId(); - if (m_Backends.count(backendId) == 0) - { - auto createBackend = BackendRegistryInstance().GetFactory(backendId); - auto it = m_Backends.emplace(std::make_pair(backendId, createBackend())); - - IBackendInternal* backend = it.first->second.get(); - - if (backend->SupportsTensorAllocatorAPI()) - { - backend->RegisterTensorHandleFactories(m_TensorHandleFactoryRegistry); - - auto workloadFactory = backend->CreateWorkloadFactory(m_TensorHandleFactoryRegistry); - m_WorkloadFactories.emplace( - std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr))); - } - else - { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager(); - auto workloadFactory = backend->CreateWorkloadFactory(memoryManager); - - m_WorkloadFactories.emplace( - std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager))); - } - } - } - - // Check backends support BackendCapability::AsyncExecution - for (auto const& backend : m_Backends) - { - if (!IsCapabilitySupported(backend.first, BackendCapability::AsyncExecution)) - { - ARMNN_LOG(warning) << fmt::format("AsyncNetworkImpl() Backend: '{0}' does not support Async Execution. " - "Will fall back to default implementation.", - backend.first.Get()); - } - - } - - profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); - std::unique_ptr timelineUtils = - profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); - if (timelineUtils) - { - timelineUtils->CreateTypedEntity(networkGuid, profiling::LabelsAndEventClasses::NETWORK_GUID); - } - - //Then create workloads. - for (auto &&layer : order) - { - if (timelineUtils) - { - // Add layer to the post-optimisation network structure - AddLayerStructure(timelineUtils, *layer, networkGuid); - } - - const IWorkloadFactory &workloadFactory = GetWorkloadFactory(*layer); - - switch (layer->GetType()) - { - case LayerType::Input: - case LayerType::Output: - { - // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput(). - break; - } - default: - { - auto workload = layer->CreateWorkload(workloadFactory); - - if (!workload) - { - const char* const layerName = - layer->GetNameStr().length() != 0 ? layer->GetName() : ""; - throw InvalidArgumentException( - fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')", - layerName, - static_cast(layer->GetType()), - layer->GetBackendId().Get() - )); - } - - if (timelineUtils) - { - // Add workload to the post-optimisation network structure - AddWorkloadStructure(timelineUtils, workload, *layer); - } - - m_WorkloadQueue.push_back(move(workload)); - // release the constant data in the layer.. - layer->ReleaseConstantData(); - break; - } - } - } - - if (timelineUtils) - { - // Commit to send the post-optimisation network structure - timelineUtils->Commit(); - } - - // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload. - // PostAllocationConfiguure will now need to be handled in the ExecuteOn(WorkingMemDescriptor) - for (auto &workload : m_WorkloadQueue) - { - workload->PostAllocationConfigure(); - } -} - -Status AsyncNetworkImpl::Execute(const InputTensors& inputTensors, - const OutputTensors& outputTensors, - IWorkingMemHandle& iWorkingMemHandle) -{ - const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); - - // Walk graph to determine the order of execution. 
- if (graph.GetNumLayers() < 2) - { - ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; - return Status::Failure; - } - - if (graph.GetNumInputs() != inputTensors.size()) - { - throw InvalidArgumentException("Number of inputs provided does not match network."); - } - - std::unique_ptr timelineUtils = - profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); - profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid(); - if (timelineUtils) - { - // Add inference timeline trace if profiling is enabled. - profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); - timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID); - timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, - networkGuid, - inferenceGuid, - profiling::LabelsAndEventClasses::EXECUTION_OF_GUID); - timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS); - } - - bool executionSucceeded = true; - - if (timelineUtils) - { - // Add end of life of the inference timeline if profiling is enabled. - timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS); - timelineUtils->Commit(); - } - WorkingMemHandle& workingMemHandle = dynamic_cast(iWorkingMemHandle); - std::lock_guard lockGuard(workingMemHandle.GetMutex()); - - if (!workingMemHandle.IsAllocated()) - { - workingMemHandle.Allocate(); - } - - { - ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs"); - unsigned int i = 0; - - for (const BindableLayer* inputLayer : graph.GetInputLayers()) - { - EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle); - ++i; - } - } - - auto Fail = [&](const std::exception& error) - { - ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what(); - executionSucceeded = false; - }; - profiling::ProfilingDynamicGuid workloadInferenceID(0); - - try - { - for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i) - { - auto& workload = m_WorkloadQueue[i]; - if (timelineUtils) - { - workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), - inferenceGuid); - } - workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i)); - - if (timelineUtils) - { - timelineUtils->RecordEndOfLifeEvent(workloadInferenceID); - } - } - } - catch (const RuntimeException& error) - { - Fail(error); - } - catch (const std::runtime_error& error) - { - Fail(error); - } - // For each output to the network, call EnqueueOutput with the data passed by the user. - { - ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs"); - unsigned int i = static_cast(m_WorkloadQueue.size() - graph.GetNumOutputs()); - - for (const BindableLayer* outputLayer : graph.GetOutputLayers()) - { - EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle); - ++i; - } - } - return executionSucceeded ? Status::Success : Status::Failure; -} - -/// Get the profiler used for this network -std::shared_ptr AsyncNetworkImpl::GetProfiler() const -{ - return m_Profiler; -} - -void AsyncNetworkImpl::RegisterDebugCallback(const DebugCallbackFunction& func) -{ - for (auto&& workloadPtr: m_WorkloadQueue) - { - workloadPtr.get()->RegisterDebugCallback(func); - } -} - -/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have -/// overlapped Execution by calling this function from different threads. 
-std::unique_ptr AsyncNetworkImpl::CreateWorkingMemHandle() -{ - Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); - std::unordered_map > tensorHandles; - std::vector workingMemDescriptors; - std::unordered_map workingMemDescriptorMap; - - for (auto&& layer : order) - { - if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output) - { - continue; - } - WorkingMemDescriptor workingMemDescriptor; - // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer - // If Export is enabled disable memory management so we can export, otherwise we do a copy - if((layer->GetNumOutputSlots() == 1) && - (layer->GetOutputSlots()[0].GetNumConnections() == 1) && - (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) - { - CollectInputTensorHandles(tensorHandles, - workingMemDescriptor.m_Inputs, - layer, - m_TensorHandleFactoryRegistry, - !m_NetworkProperties.m_ExportEnabled); - CreateOutputTensorHandles(tensorHandles, - workingMemDescriptor.m_Outputs, - layer, - m_TensorHandleFactoryRegistry, - !m_NetworkProperties.m_ExportEnabled); - } - else - { - CollectInputTensorHandles(tensorHandles, - workingMemDescriptor.m_Inputs, - layer, - m_TensorHandleFactoryRegistry); - CreateOutputTensorHandles(tensorHandles, - workingMemDescriptor.m_Outputs, - layer, - m_TensorHandleFactoryRegistry); - } - workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); - workingMemDescriptors.push_back(workingMemDescriptor); - } - return std::make_unique(workingMemDescriptors, workingMemDescriptorMap); -} - -void AsyncNetworkImpl::FreeWorkingMemory() -{ - // Informs the memory managers to release memory in it's respective memory group - for (auto&& workloadFactory : m_WorkloadFactories) - { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second; - if (memoryManager) - { - memoryManager->Release(); - } - } - m_TensorHandleFactoryRegistry.ReleaseMemory(); -} - -} // end experimental namespace - -} // end armnn namespace diff --git a/src/armnn/AsyncNetwork.hpp b/src/armnn/AsyncNetwork.hpp deleted file mode 100644 index 9bdc7eebd7..0000000000 --- a/src/armnn/AsyncNetwork.hpp +++ /dev/null @@ -1,106 +0,0 @@ -// -// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include -#include -#include - -#include "LayerFwd.hpp" -#include "Network.hpp" -#include "Profiling.hpp" -#include "WorkingMemHandle.hpp" - -#include -#include -#include -#include -#include -#include - -#include - -namespace armnn -{ - -namespace experimental -{ - -class AsyncNetworkImpl final -{ -public: - using WorkloadQueue = std::vector>; - - AsyncNetworkImpl(std::unique_ptr net, - const INetworkProperties &networkProperties, - profiling::ProfilingService &profilingService); - - ~AsyncNetworkImpl() { FreeWorkingMemory(); } - - TensorInfo GetInputTensorInfo(LayerBindingId layerId) const; - TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const; - - /// Thread safe execution of the network. Returns once execution is complete. - /// Will block until this and any other thread using the same workingMem object completes. - virtual Status Execute(const InputTensors& inputTensors, - const OutputTensors& outputTensors, - IWorkingMemHandle& workingMemHandle); - - /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have - /// overlapped Execution by calling this function from different threads. 
- std::unique_ptr CreateWorkingMemHandle(); - - /// Get the profiler used for this network - std::shared_ptr GetProfiler() const; - - /// Register a debug callback function to be used with this network - void RegisterDebugCallback(const DebugCallbackFunction& func); - -private: - void FreeWorkingMemory(); - - void CollectInputTensorHandles(std::unordered_map >& tensorHandles, - std::vector& inputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged = false); - - void CreateOutputTensorHandles(std::unordered_map >& tensorHandles, - std::vector& outputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged = false); - - void EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& handle); - - void EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle); - - using BackendPtrMap = std::unordered_map; - - using WorkloadFactoryWithMemoryManager = - std::pair; - - using WorkloadFactoryMap = std::unordered_map; - - const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const; - - BackendPtrMap m_Backends; - WorkloadFactoryMap m_WorkloadFactories; - - std::unique_ptr m_OptimizedNetwork; - INetworkProperties m_NetworkProperties; - WorkloadQueue m_WorkloadQueue; - std::shared_ptr m_Profiler; - - TensorHandleFactoryRegistry m_TensorHandleFactoryRegistry; - - /// Profiling Service Instance - profiling::ProfilingService& m_ProfilingService; -}; - -} // end experimental namespace - -} // end armnn namespace diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index ea09231c3c..d75a2021b2 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -10,6 +10,7 @@ #include #include "Profiling.hpp" #include "HeapProfiling.hpp" +#include "WorkingMemHandle.hpp" #include #include @@ -119,8 +120,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, const INetworkProperties& networkProperties, profiling::ProfilingService& profilingService) : m_OptimizedNetwork(std::move(net)), - m_IsImportEnabled(networkProperties.m_ImportEnabled), - m_IsExportEnabled(networkProperties.m_ExportEnabled), + m_NetworkProperties(networkProperties), m_TensorHandleFactoryRegistry(), m_ProfilingService(profilingService) { @@ -172,7 +172,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, case LayerType::MemImport: { // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsImportEnabled); + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, + !m_NetworkProperties.m_ImportEnabled); break; } default: @@ -183,7 +184,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, (layer->GetOutputSlots()[0].GetNumConnections() == 1) && (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) { - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsExportEnabled); + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, + !m_NetworkProperties.m_ExportEnabled); } else { @@ -576,7 +578,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags(); bool needMemCopy = true; - if (m_IsImportEnabled) // Try import the input tensor + if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor { 
if(CheckFlag(importFlags, MemorySource::Malloc) ) { @@ -647,7 +649,8 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten // d) The output pointer is allocated via malloc. (Other types will be supported in a later release) // e) m_IsExportEnabled must be set to true bool needMemCopy = true; - if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) + if (m_NetworkProperties.m_ExportEnabled && + (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) { if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) { @@ -792,6 +795,353 @@ bool LoadedNetwork::Execute(std::unique_ptr& timelineUti return success; } +void LoadedNetwork::EnqueueInput(const BindableLayer& layer, + const ConstTensor& inputTensor, + WorkingMemHandle& context) +{ + if (layer.GetType() != LayerType::Input) + { + throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer"); + } + LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid(); + WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id); + ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output"); + + MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); + if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor + { + if (CheckFlag(importFlags, MemorySource::Malloc) ) + { + // This assumes a CPU Tensor handle + std::unique_ptr tensorHandle = + std::make_unique(inputTensor.GetInfo(), + inputTensor.GetMemoryArea()); + + void* mem = tensorHandle->Map(false); + if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc)) + { + tensorHandle->Unmap(); + return; + } + tensorHandle->Unmap(); + throw MemoryImportException("EnqueueInput: Memory Import failed"); + } + else + { + throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import"); + } + } + else + { + std::unique_ptr tensorHandle = + std::make_unique(inputTensor.GetInfo(), inputTensor.GetMemoryArea()); + + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + for (const auto& input : descriptor.m_Inputs) + { + CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc); + } + } +} + +void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle) +{ + if (layer.GetType() != LayerType::Output) + { + throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer"); + } + ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); + + LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid(); + WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id); + + ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0]; + ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated."); + + // Try import the output tensor. + // Note: We can only import the output pointer if all of the following hold true: + // a) The imported pointer is aligned sufficiently + // b) The tensor has zero padding + // c) There is only one connection to the OutputSlot and it is to an OutputLayer. + // d) The output pointer is allocated via malloc. 
(Other types will be supported in a later release) + // e) m_IsExportEnabled must be set to true + if (m_NetworkProperties.m_ExportEnabled && + (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) + { + if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) + { + MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); + if (CheckFlag(importFlags, MemorySource::Malloc)) + { + std::unique_ptr tensorHandle = + std::make_unique(outputTensor.GetInfo(), + outputTensor.GetMemoryArea()); + + void* mem = tensorHandle->Map(false); + bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); + tensorHandle->Unmap(); + + if (importOk) + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute"); + descriptor.m_Inputs[0]->Map(true); + descriptor.m_Inputs[0]->Unmap(); + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed"); + } + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export"); + } + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer"); + } + } + else + { + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + std::unique_ptr tensorHandle = + std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); + + CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc); + } +} + +Status LoadedNetwork::Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& iWorkingMemHandle) +{ + const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + + // Walk graph to determine the order of execution. + if (graph.GetNumLayers() < 2) + { + ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; + return Status::Failure; + } + + if (graph.GetNumInputs() != inputTensors.size()) + { + throw InvalidArgumentException("Number of inputs provided does not match network."); + } + + std::unique_ptr timelineUtils = + profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); + profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid(); + if (timelineUtils) + { + // Add inference timeline trace if profiling is enabled. + profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); + timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID); + timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, + networkGuid, + inferenceGuid, + profiling::LabelsAndEventClasses::EXECUTION_OF_GUID); + timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS); + } + + bool executionSucceeded = true; + + if (timelineUtils) + { + // Add end of life of the inference timeline if profiling is enabled. 
+ timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS); + timelineUtils->Commit(); + } + WorkingMemHandle& workingMemHandle = dynamic_cast(iWorkingMemHandle); + std::lock_guard lockGuard(workingMemHandle.GetMutex()); + + if (!workingMemHandle.IsAllocated()) + { + workingMemHandle.Allocate(); + } + + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs"); + unsigned int i = 0; + + for (const BindableLayer* inputLayer : graph.GetInputLayers()) + { + EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle); + ++i; + } + } + + auto Fail = [&](const std::exception& error) + { + ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what(); + executionSucceeded = false; + }; + profiling::ProfilingDynamicGuid workloadInferenceID(0); + + try + { + for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i) + { + auto& workload = m_WorkloadQueue[i]; + if (timelineUtils) + { + workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), + inferenceGuid); + } + workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i)); + + if (timelineUtils) + { + timelineUtils->RecordEndOfLifeEvent(workloadInferenceID); + } + } + } + catch (const RuntimeException& error) + { + Fail(error); + } + catch (const std::runtime_error& error) + { + Fail(error); + } + // For each output to the network, call EnqueueOutput with the data passed by the user. + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs"); + unsigned int i = static_cast(m_WorkloadQueue.size() - graph.GetNumOutputs()); + + for (const BindableLayer* outputLayer : graph.GetOutputLayers()) + { + EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle); + ++i; + } + } + return executionSucceeded ? Status::Success : Status::Failure; +} +// Need something like the collectors to get the correct tensors for the inputs +void LoadedNetwork::CollectInputTensorHandles( + std::unordered_map >& tensorHandles, + std::vector& inputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged) +{ + for (auto&& inputSlot : layer->GetInputSlots()) + { + // The graph must be well-formed at this point. 
+ ARMNN_ASSERT(inputSlot.GetConnection()); + auto outputSlot = inputSlot.GetConnectedOutputSlot(); + auto key = outputSlot->GetOwningLayer().GetGuid(); + auto search = tensorHandles.find(key); + + if (search == tensorHandles.end()) + { + ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = outputSlot->GetTensorInfo(); + + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + ITensorHandle* tensorPtr = tensor.release(); + inputs.push_back(tensorPtr); + } + else + { + unsigned int index = outputSlot->CalculateIndexOnOwner(); + inputs.push_back(search->second[index]); + } + } +} + +void LoadedNetwork::CreateOutputTensorHandles( + std::unordered_map >& tensorHandles, + std::vector& outputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged) +{ + auto guid = layer->GetGuid(); + std::vector tensorHandleVectors; + tensorHandleVectors.reserve(layer->GetNumOutputSlots()); + + for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++) + { + const OutputSlot& slot = layer->GetOutputSlot(idx); + ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = slot.GetTensorInfo(); + + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + ITensorHandle* tensorPtr = tensor.release(); + outputs.push_back(tensorPtr); + tensorHandleVectors.push_back(tensorPtr); + } + tensorHandles.insert({guid, tensorHandleVectors}); +} + +/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have +/// overlapped Execution by calling this function from different threads. 
+std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId) +{ + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + std::unordered_map > tensorHandles; + std::vector workingMemDescriptors; + std::unordered_map workingMemDescriptorMap; + + for (auto&& layer : order) + { + if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output) + { + continue; + } + WorkingMemDescriptor workingMemDescriptor; + // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer + // If Export is enabled disable memory management so we can export, otherwise we do a copy + if((layer->GetNumOutputSlots() == 1) && + (layer->GetOutputSlots()[0].GetNumConnections() == 1) && + (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + { + CollectInputTensorHandles(tensorHandles, + workingMemDescriptor.m_Inputs, + layer, + m_TensorHandleFactoryRegistry, + !m_NetworkProperties.m_ExportEnabled); + CreateOutputTensorHandles(tensorHandles, + workingMemDescriptor.m_Outputs, + layer, + m_TensorHandleFactoryRegistry, + !m_NetworkProperties.m_ExportEnabled); + } + else + { + CollectInputTensorHandles(tensorHandles, + workingMemDescriptor.m_Inputs, + layer, + m_TensorHandleFactoryRegistry); + CreateOutputTensorHandles(tensorHandles, + workingMemDescriptor.m_Outputs, + layer, + m_TensorHandleFactoryRegistry); + } + workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); + workingMemDescriptors.push_back(workingMemDescriptor); + } + return std::make_unique(networkId, + workingMemDescriptors, + workingMemDescriptorMap); +} + void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) { for (auto&& workloadPtr: m_WorkloadQueue) diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp index c7dd37fdea..2bcf5c8c08 100644 --- a/src/armnn/LoadedNetwork.hpp +++ b/src/armnn/LoadedNetwork.hpp @@ -37,11 +37,19 @@ public: using WorkloadQueue = std::vector< std::unique_ptr >; ~LoadedNetwork(){ FreeWorkingMemory(); } + /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have + /// overlapped Execution by calling this function from different threads. 
+ std::unique_ptr CreateWorkingMemHandle(NetworkId networkId); + TensorInfo GetInputTensorInfo(LayerBindingId layerId) const; TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const; Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors); + Status Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& workingMemHandle); + static std::unique_ptr MakeLoadedNetwork(std::unique_ptr net, std::string & errorMessage, const INetworkProperties& networkProperties, @@ -58,6 +66,11 @@ public: void SendNetworkStructure(); + bool IsAsyncEnabled() + { + return m_NetworkProperties.m_AsyncEnabled; + } + profiling::ProfilingGuid GetNetworkGuid(); private: @@ -67,14 +80,29 @@ private: const INetworkProperties& networkProperties, profiling::ProfilingService& profilingService); + void CollectInputTensorHandles(std::unordered_map >& tensorHandles, + std::vector& inputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged = false); + + void CreateOutputTensorHandles(std::unordered_map >& tensorHandles, + std::vector& outputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged = false); + void EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); void EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); + void EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& handle); + + void EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle); + bool Execute(std::unique_ptr& timelineUtils, profiling::ProfilingGuid inferenceGuid); - const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const; using BackendPtrMap = std::unordered_map; @@ -96,8 +124,7 @@ private: mutable std::mutex m_WorkingMemMutex; bool m_IsWorkingMemAllocated=false; - bool m_IsImportEnabled=false; - bool m_IsExportEnabled=false; + INetworkProperties m_NetworkProperties; TensorHandleFactoryRegistry m_TensorHandleFactoryRegistry; diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index 57aaabd277..91a21d4b53 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -64,14 +64,6 @@ Status IRuntime::LoadNetwork(NetworkId& networkIdOut, return pRuntimeImpl->LoadNetwork(networkIdOut, std::move(network), errorMessage, networkProperties); } -std::unique_ptr IRuntime::CreateAsyncNetwork(NetworkId& networkIdOut, - IOptimizedNetworkPtr network, - std::string& errorMessage, - const INetworkProperties& networkProperties) -{ - return pRuntimeImpl->CreateAsyncNetwork(networkIdOut, std::move(network), errorMessage, networkProperties); -} - TensorInfo IRuntime::GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const { return pRuntimeImpl->GetInputTensorInfo(networkId, layerId); @@ -89,6 +81,13 @@ Status IRuntime::EnqueueWorkload(NetworkId networkId, return pRuntimeImpl->EnqueueWorkload(networkId, inputTensors, outputTensors); } +Status IRuntime::Execute(IWorkingMemHandle& workingMemHandle, + const InputTensors& inputTensors, + const OutputTensors& outputTensors) +{ + return pRuntimeImpl->Execute(workingMemHandle, inputTensors, outputTensors); +} + Status IRuntime::UnloadNetwork(NetworkId networkId) { return pRuntimeImpl->UnloadNetwork(networkId); @@ -99,6 +98,11 @@ const IDeviceSpec& IRuntime::GetDeviceSpec() const return pRuntimeImpl->GetDeviceSpec(); } 
+std::unique_ptr IRuntime::CreateWorkingMemHandle(NetworkId networkId) +{ + return pRuntimeImpl->CreateWorkingMemHandle(networkId); +} + const std::shared_ptr IRuntime::GetProfiler(NetworkId networkId) const { return pRuntimeImpl->GetProfiler(networkId); @@ -173,43 +177,6 @@ Status RuntimeImpl::LoadNetwork(NetworkId& networkIdOut, return Status::Success; } -std::unique_ptr RuntimeImpl::CreateAsyncNetwork(NetworkId& networkIdOut, - IOptimizedNetworkPtr network, - std::string&, - const INetworkProperties& networkProperties) -{ - IOptimizedNetwork* rawNetwork = network.release(); - - networkIdOut = GenerateNetworkId(); - - for (auto&& context : m_BackendContexts) - { - context.second->BeforeLoadNetwork(networkIdOut); - } - - unique_ptr asyncNetwork = std::make_unique( - std::unique_ptr(rawNetwork), - networkProperties, - m_ProfilingService); - - if (!asyncNetwork) - { - return nullptr; - } - - for (auto&& context : m_BackendContexts) - { - context.second->AfterLoadNetwork(networkIdOut); - } - - if (m_ProfilingService.IsProfilingEnabled()) - { - m_ProfilingService.IncrementCounterValue(armnn::profiling::NETWORK_LOADS); - } - - return asyncNetwork; -} - Status RuntimeImpl::UnloadNetwork(NetworkId networkId) { bool unloadOk = true; @@ -430,6 +397,17 @@ Status RuntimeImpl::EnqueueWorkload(NetworkId networkId, const OutputTensors& outputTensors) { LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); + + if (!loadedNetwork) + { + ARMNN_LOG(error) << "A Network with an id of " << networkId << " does not exist.\n"; + return Status::Failure; + } + if (loadedNetwork->IsAsyncEnabled()) + { + ARMNN_LOG(error) << "Network " << networkId << " is async enabled.\n"; + return Status::Failure; + } ProfilerManager::GetInstance().RegisterProfiler(loadedNetwork->GetProfiler().get()); ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload"); @@ -447,6 +425,73 @@ Status RuntimeImpl::EnqueueWorkload(NetworkId networkId, return loadedNetwork->EnqueueWorkload(inputTensors, outputTensors); } +Status RuntimeImpl::Execute(IWorkingMemHandle& iWorkingMemHandle, + const InputTensors& inputTensors, + const OutputTensors& outputTensors) +{ + NetworkId networkId = iWorkingMemHandle.GetNetworkId(); + LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); + + if (!loadedNetwork) + { + ARMNN_LOG(error) << "A Network with an id of " << networkId << " does not exist.\n"; + return Status::Failure; + } + if (!loadedNetwork->IsAsyncEnabled()) + { + ARMNN_LOG(error) << "Network " << networkId << " is not async enabled.\n"; + return Status::Failure; + } + ProfilerManager::GetInstance().RegisterProfiler(loadedNetwork->GetProfiler().get()); + + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute"); + + static thread_local NetworkId lastId = networkId; + if (lastId != networkId) + { + LoadedNetworkFuncSafe(lastId, [](LoadedNetwork* network) + { + network->FreeWorkingMemory(); + }); + } + lastId=networkId; + + return loadedNetwork->Execute(inputTensors, outputTensors, iWorkingMemHandle); +} + +/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have +/// overlapped Execution by calling this function from different threads. 
+std::unique_ptr RuntimeImpl::CreateWorkingMemHandle(NetworkId networkId) +{ + LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); + + if (!loadedNetwork) + { + ARMNN_LOG(error) << "A Network with an id of " << networkId << " does not exist.\n"; + return nullptr; + } + if (!loadedNetwork->IsAsyncEnabled()) + { + ARMNN_LOG(error) << "Network " << networkId << " is not async enabled.\n"; + return nullptr; + } + ProfilerManager::GetInstance().RegisterProfiler(loadedNetwork->GetProfiler().get()); + + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CreateWorkingMemHandle"); + + static thread_local NetworkId lastId = networkId; + if (lastId != networkId) + { + LoadedNetworkFuncSafe(lastId, [](LoadedNetwork* network) + { + network->FreeWorkingMemory(); + }); + } + lastId=networkId; + + return loadedNetwork->CreateWorkingMemHandle(networkId); +} + void RuntimeImpl::RegisterDebugCallback(NetworkId networkId, const DebugCallbackFunction& func) { LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp index 150012eb61..da5445383f 100644 --- a/src/armnn/Runtime.hpp +++ b/src/armnn/Runtime.hpp @@ -4,7 +4,6 @@ // #pragma once -#include "AsyncNetwork.hpp" #include "LoadedNetwork.hpp" #include "DeviceSpec.hpp" @@ -56,17 +55,14 @@ public: TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; - // Create Aysnchronous Network from the IOptimizedNetowrkPtr - std::unique_ptr CreateAsyncNetwork(NetworkId& networkIdOut, - IOptimizedNetworkPtr network, - std::string& errorMessage, - const INetworkProperties& networkProperties); - - // Evaluates network using input in inputTensors, outputs filled into outputTensors. Status EnqueueWorkload(NetworkId networkId, - const InputTensors& inputTensors, - const OutputTensors& outputTensors); + const InputTensors& inputTensors, + const OutputTensors& outputTensors); + + Status Execute(IWorkingMemHandle& workingMemHandle, + const InputTensors& inputTensors, + const OutputTensors& outputTensors); /// Unloads a network from the Runtime. /// At the moment this only removes the network from the m_Impl->m_Network. @@ -82,6 +78,10 @@ public: /// @return A pointer to the requested profiler, or nullptr if not found. const std::shared_ptr GetProfiler(NetworkId networkId) const; + /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have + /// overlapped Execution by calling this function from different threads. + std::unique_ptr CreateWorkingMemHandle(NetworkId networkId); + /// Registers a callback function to debug layers performing custom computations on intermediate tensors. /// @param networkId The id of the network to register the callback. /// @param func callback function to pass to the debug layer. 
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
index 7a901b296b..c1a48d482f 100644
--- a/src/armnn/WorkingMemHandle.cpp
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -13,8 +13,10 @@ namespace armnn
 namespace experimental
 {
 
-WorkingMemHandle::WorkingMemHandle(std::vector workingMemDescriptors,
+WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
+                                   std::vector workingMemDescriptors,
                                    std::unordered_map workingMemDescriptorMap) :
+    m_NetworkId(networkId),
     m_WorkingMemDescriptors(workingMemDescriptors),
     m_WorkingMemDescriptorMap(workingMemDescriptorMap),
     m_IsAllocated(false),
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
index 090f180206..cef6fb6fd3 100644
--- a/src/armnn/WorkingMemHandle.hpp
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -24,10 +24,17 @@ class WorkingMemHandle final : public IWorkingMemHandle
 {
 public:
 
-    WorkingMemHandle(std::vector workingMemDescriptors,
+    WorkingMemHandle(NetworkId networkId,
+                     std::vector workingMemDescriptors,
                      std::unordered_map workingMemDescriptorMap);
 
-    ~WorkingMemHandle() { FreeWorkingMemory(); }
+    ~WorkingMemHandle()
+    { FreeWorkingMemory(); }
+
+    NetworkId GetNetworkId() override
+    {
+        return m_NetworkId;
+    }
 
     /// Allocate the backing memory required for execution. If this is not called, then allocation will be
     /// deferred to execution time. The mutex must be locked.
@@ -106,6 +113,7 @@ public:
 private:
     void FreeWorkingMemory();
 
+    NetworkId m_NetworkId;
     std::shared_ptr m_Profiler;
 
     std::vector m_WorkingMemDescriptors;
diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
index 2ccd2b13af..66ccdbf1d9 100644
--- a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
+++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
@@ -40,15 +40,15 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
     // Creates AsyncNetwork
     NetworkId networkId = 0;
     std::string errorMessage;
-    const INetworkProperties networkProperties;
-    auto asyncNetwork = runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties);
+    const INetworkProperties networkProperties(false, false, true);
+    runtime->LoadNetwork(networkId, std::move(optNet), errorMessage, networkProperties);
 
     InputTensors inputTensors;
     inputTensors.reserve(inputTensorData.size());
     for (auto&& it : inputTensorData)
     {
         inputTensors.push_back({it.first,
-                                ConstTensor(asyncNetwork->GetInputTensorInfo(it.first), it.second.data())});
+                                ConstTensor(runtime->GetInputTensorInfo(networkId, it.first), it.second.data())});
     }
 
     OutputTensors outputTensors;
@@ -59,16 +59,16 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
         std::vector out(it.second.size());
         outputStorage.emplace(it.first, out);
         outputTensors.push_back({it.first,
-                                 Tensor(asyncNetwork->GetOutputTensorInfo(it.first),
+                                 Tensor(runtime->GetOutputTensorInfo(networkId, it.first),
                                         outputStorage.at(it.first).data())});
     }
 
     // Create WorkingMemHandle for this async network
-    std::unique_ptr workingMemHandle = asyncNetwork->CreateWorkingMemHandle();
+    std::unique_ptr workingMemHandle = runtime->CreateWorkingMemHandle(networkId);
     IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get();
 
     // Run the async network
-    asyncNetwork->Execute(inputTensors, outputTensors, workingMemHandleRef);
+    runtime->Execute(workingMemHandleRef, inputTensors, outputTensors);
 
     // Checks the results.
     for (auto&& it : expectedOutputData)
-- cgit v1.2.1
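
For reference, the application-level flow this refactor enables is the one exercised by the test diff above: load an optimized network with the new asyncEnabled flag, create one IWorkingMemHandle per thread, and call IRuntime::Execute(). The sketch below is illustrative and not part of the patch: the empty network, the CpuRef backend choice, and the binding id 0 are assumptions; only the INetworkProperties(false, false, true) flag order and the LoadNetwork / CreateWorkingMemHandle / Execute sequence come from the diff.

// Minimal usage sketch of the refactored async API (assumptions noted above).
#include <armnn/ArmNN.hpp>

#include <memory>
#include <string>
#include <vector>

int main()
{
    using namespace armnn;

    IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());

    // Placeholder network; a real caller would add input, operator and output
    // layers here, with input/output binding ids (assumed to be 0 below).
    INetworkPtr network = INetwork::Create();
    IOptimizedNetworkPtr optNet = Optimize(*network, {Compute::CpuRef}, runtime->GetDeviceSpec());

    // asyncEnabled == true routes this network to the Execute() path;
    // per this patch, EnqueueWorkload() on an async-enabled network fails,
    // and Execute() on a non-async network fails.
    NetworkId networkId = 0;
    std::string errorMessage;
    INetworkProperties networkProperties(false, false, true);
    runtime->LoadNetwork(networkId, std::move(optNet), errorMessage, networkProperties);

    std::vector<float> inputData(4, 1.0f);
    std::vector<float> outputData(4, 0.0f);
    InputTensors inputs{{0, ConstTensor(runtime->GetInputTensorInfo(networkId, 0), inputData.data())}};
    OutputTensors outputs{{0, Tensor(runtime->GetOutputTensorInfo(networkId, 0), outputData.data())}};

    // One working-memory handle per thread; create several handles to run
    // overlapped executions of the same loaded network.
    std::unique_ptr<experimental::IWorkingMemHandle> memHandle =
        runtime->CreateWorkingMemHandle(networkId);

    Status status = runtime->Execute(*memHandle, inputs, outputs);
    return status == Status::Success ? 0 : 1;
}

Because each working-memory handle now records its NetworkId (see the IWorkingMemHandle.hpp and WorkingMemHandle.hpp hunks), Execute() can recover the owning LoadedNetwork from the handle alone, which is why it takes the handle as its first argument instead of a NetworkId the way EnqueueWorkload() does.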