27 files changed, 1371 insertions, 30 deletions
diff --git a/Android.mk b/Android.mk index aa89ff9292..806d81bcd5 100644 --- a/Android.mk +++ b/Android.mk @@ -108,6 +108,7 @@ LOCAL_SRC_FILES := \ profiling/server/src/timelineDecoder/TimelineCaptureCommandHandler.cpp \ profiling/server/src/timelineDecoder/TimelineDecoder.cpp \ profiling/server/src/timelineDecoder/TimelineDirectoryCaptureCommandHandler.cpp \ + src/armnn/AsyncNetwork.cpp \ src/armnn/BackendHelper.cpp \ src/armnn/BackendRegistry.cpp \ src/armnn/Descriptors.cpp \ @@ -134,6 +135,7 @@ LOCAL_SRC_FILES := \ src/armnn/TypesUtils.cpp \ src/armnn/Utils.cpp \ src/armnn/WallClockTimer.cpp \ + src/armnn/WorkingMemHandle.cpp \ src/armnnUtils/DataLayoutIndexed.cpp \ src/armnnUtils/DotSerializer.cpp \ src/armnnUtils/FloatingPointConverter.cpp \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e75c28da0..62417bebb3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -242,12 +242,14 @@ list(APPEND armnn_sources include/armnn/Descriptors.hpp include/armnn/DescriptorsFwd.hpp include/armnn/Exceptions.hpp + include/armnn/IAsyncNetwork.hpp include/armnn/ILayerSupport.hpp include/armnn/ILayerVisitor.hpp include/armnn/INetwork.hpp include/armnn/IProfiler.hpp include/armnn/IRuntime.hpp include/armnn/IStrategy.hpp + include/armnn/IWorkingMemHandle.hpp include/armnn/LayerSupport.hpp include/armnn/LayerVisitorBase.hpp include/armnn/Logging.hpp @@ -406,6 +408,8 @@ list(APPEND armnn_sources src/armnn/layers/TransposeLayer.cpp src/armnn/layers/UnmapLayer.cpp src/armnn/layers/UnmapLayer.hpp + src/armnn/AsyncNetwork.cpp + src/armnn/AsyncNetwork.hpp src/armnn/BackendRegistry.cpp src/armnn/BackendSettings.hpp src/armnn/BackendHelper.cpp @@ -477,6 +481,9 @@ list(APPEND armnn_sources src/armnn/Utils.cpp src/armnn/WallClockTimer.cpp src/armnn/WallClockTimer.hpp + src/armnn/WorkingMemDescriptor.hpp + src/armnn/WorkingMemHandle.cpp + src/armnn/WorkingMemHandle.hpp src/armnn/optimizations/AddBroadcastReshapeLayer.hpp src/armnn/optimizations/AddDebug.hpp src/armnn/optimizations/All.hpp diff --git a/include/armnn/ArmNN.hpp b/include/armnn/ArmNN.hpp index 4b945b91b3..ac4d33f737 100644 --- a/include/armnn/ArmNN.hpp +++ b/include/armnn/ArmNN.hpp @@ -7,6 +7,7 @@ #include "BackendId.hpp" #include "Descriptors.hpp" #include "Exceptions.hpp" +#include "IAsyncNetwork.hpp" #include "INetwork.hpp" #include "IRuntime.hpp" #include "LstmParams.hpp" diff --git a/include/armnn/IAsyncNetwork.hpp b/include/armnn/IAsyncNetwork.hpp new file mode 100644 index 0000000000..7ef83bbff1 --- /dev/null +++ b/include/armnn/IAsyncNetwork.hpp @@ -0,0 +1,51 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <armnn/NetworkFwd.hpp> + +#include "INetwork.hpp" +#include "IProfiler.hpp" +#include "IWorkingMemHandle.hpp" +#include "Tensor.hpp" +#include "Types.hpp" + +#include <mutex> + +namespace armnn +{ + +namespace experimental +{ + +class IAsyncNetwork +{ +public: + virtual ~IAsyncNetwork() {}; + + virtual TensorInfo GetInputTensorInfo(LayerBindingId layerId) const = 0; + virtual TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const = 0; + + /// Thread safe execution of the network. Returns once execution is complete. + /// Will block until this and any other thread using the same workingMem object completes. + virtual Status Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& workingMemHandle) = 0; + + /// Create a new unique WorkingMemHandle object. 
Create multiple handles if you wish to have + /// overlapped Execution by calling this function from different threads. + virtual std::unique_ptr<IWorkingMemHandle> CreateWorkingMemHandle() = 0; + + /// Get the profiler used for this network + virtual std::shared_ptr<IProfiler> GetProfiler() const = 0; + + /// Register a debug callback function to be used with this network + virtual void RegisterDebugCallback(const DebugCallbackFunction& func) = 0; +}; + +} // end experimental namespace + +} // end armnn namespace diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp index bceb07405a..2db6d5de83 100644 --- a/include/armnn/INetwork.hpp +++ b/include/armnn/INetwork.hpp @@ -704,6 +704,12 @@ protected: std::unique_ptr<NetworkImpl> pNetworkImpl; }; +namespace experimental +{ +class AsyncNetwork; +class WorkingMemHandle; +} + struct BackendSettings; struct OptimizationResult; class OptimizedNetworkImpl; @@ -723,6 +729,10 @@ public: protected: friend class LoadedNetwork; + + friend class experimental::AsyncNetwork; + friend class experimental::WorkingMemHandle; + friend Graph& GetGraphForTesting(IOptimizedNetwork* optNetPtr); friend ModelOptions& GetModelOptionsForTesting(IOptimizedNetwork* optNetPtr); friend IOptimizedNetworkPtr Optimize(const INetwork& inNetwork, diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index 9122089b62..9f7032914f 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -5,6 +5,7 @@ #pragma once #include "BackendOptions.hpp" +#include "IAsyncNetwork.hpp" #include "INetwork.hpp" #include "IProfiler.hpp" #include "Tensor.hpp" @@ -37,6 +38,8 @@ struct INetworkProperties virtual ~INetworkProperties() {} }; +using namespace armnn::experimental; + class IRuntime { public: @@ -142,6 +145,20 @@ public: std::string& errorMessage, const INetworkProperties& networkProperties); + /// This is an experimental function. + /// Creates an executable network. This network is thread safe allowing for multiple networks to be + /// loaded simultaneously via different threads. + /// Note that the network is never registered with the runtime so does not need to be 'Unloaded'. + /// @param [out] networkIdOut Unique identifier for the network is returned in this reference. + /// @param [in] network Complete network to load into the IRuntime. + /// @param [out] errorMessage Error message if there were any errors. + /// @param [out] networkProperties the INetworkProperties that govern how the network should operate. + /// @return The IAsyncNetwork + std::unique_ptr<IAsyncNetwork> CreateAsyncNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string& errorMessage, + const INetworkProperties& networkProperties); + TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; diff --git a/include/armnn/IWorkingMemHandle.hpp b/include/armnn/IWorkingMemHandle.hpp new file mode 100644 index 0000000000..921b7e1f40 --- /dev/null +++ b/include/armnn/IWorkingMemHandle.hpp @@ -0,0 +1,46 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <mutex> + +namespace armnn +{ + +namespace experimental +{ + +struct WorkingMemDescriptor; + +class IWorkingMemHandle +{ +public: + virtual ~IWorkingMemHandle() {}; + + /// Allocate the backing memory required for execution. If this is not called, then allocation will be + /// deferred to execution time. 
The mutex must be locked. + virtual void Allocate() = 0; + + /// Free the backing memory required for execution. The mutex must be locked. + virtual void Free() = 0; + + /// IsAllocated returns true if the backing memory is currently allocated. The mutex must be locked. + virtual bool IsAllocated() = 0; + + /// Get a mutex which can be used for synchronizing access to the WorkingMemHandle object. + virtual std::mutex& GetMutex() = 0; + + /// Get the WorkingMemDescriptor for a Layer. The mutex must be locked. + virtual WorkingMemDescriptor& GetWorkingMemDescriptor(LayerGuid id) = 0; + + /// Get the WorkingMemDescriptor at an index. The WorkingMemDescriptors are stored in the same order as + /// the Workloads in a topologically sorted graph. The mutex must be locked. + virtual WorkingMemDescriptor& GetWorkingMemDescriptorAt(unsigned int id) = 0; +}; + +} // end experimental namespace + +} // end armnn namespace diff --git a/include/armnn/NetworkFwd.hpp b/include/armnn/NetworkFwd.hpp index 619839eb64..6c2970f28b 100644 --- a/include/armnn/NetworkFwd.hpp +++ b/include/armnn/NetworkFwd.hpp @@ -6,8 +6,17 @@ namespace armnn { + struct LstmInputParams; struct QuantizedLstmInputParams; + +namespace experimental +{ + +class IAsyncNetwork; + +} // end experimental namespace + class INetwork; class IOptimizedNetwork; class Graph; @@ -15,4 +24,5 @@ class IInputSlot; class IOutputSlot; class IConnectableLayer; class IDataLayer; -} + +} // end armnn namespace diff --git a/include/armnn/backends/IWorkload.hpp b/include/armnn/backends/IWorkload.hpp index 0bd8d2db75..a4827ebcdf 100644 --- a/include/armnn/backends/IWorkload.hpp +++ b/include/armnn/backends/IWorkload.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2020 Arm Ltd. All rights reserved. +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -9,6 +9,15 @@ namespace armnn { +namespace experimental +{ + +struct WorkingMemDescriptor; + +} // end experimental namespace + +using namespace armnn::experimental; + /// Workload interface to enqueue a layer computation. class IWorkload { public: @@ -18,9 +27,11 @@ public: virtual void Execute() const = 0; + virtual void ExecuteAsync(WorkingMemDescriptor& desc) = 0; + virtual profiling::ProfilingGuid GetGuid() const = 0; - virtual void RegisterDebugCallback(const DebugCallbackFunction & /*func*/) {} + virtual void RegisterDebugCallback(const DebugCallbackFunction& /*func*/) {} }; } //namespace armnn diff --git a/src/armnn/AsyncNetwork.cpp b/src/armnn/AsyncNetwork.cpp new file mode 100644 index 0000000000..4698bcf399 --- /dev/null +++ b/src/armnn/AsyncNetwork.cpp @@ -0,0 +1,611 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "AsyncNetwork.hpp" +#include "Graph.hpp" +#include "Layer.hpp" +#include "Profiling.hpp" + +#include <armnn/BackendRegistry.hpp> +#include <armnn/Logging.hpp> +#include <armnn/utility/Assert.hpp> + +#include <armnn/backends/IMemoryManager.hpp> +#include <backendsCommon/CpuTensorHandle.hpp> +#include <backendsCommon/WorkloadData.hpp> +#include <backendsCommon/MemCopyWorkload.hpp> +#include <LabelsAndEventClasses.hpp> + +#include <fmt/format.h> + +namespace armnn +{ + +namespace experimental +{ + +void AddLayerStructure(std::unique_ptr<profiling::TimelineUtilityMethods>& timelineUtils, + const Layer& layer, + profiling::ProfilingGuid networkGuid) +{ + // Add layer to the post-optimisation network structure + std::string layerName = layer.GetNameStr().empty() ? 
"<Unnamed>" : layer.GetNameStr(); + timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(), + networkGuid, + layerName, + profiling::LabelsAndEventClasses::LAYER_GUID); + for (auto&& input : layer.GetInputSlots()) + { + const IOutputSlot* source = input.GetConnectedOutputSlot(); + ARMNN_ASSERT(source != NULL); + timelineUtils->CreateConnectionRelationship(profiling::ProfilingRelationshipType::RetentionLink, + source->GetOwningLayerGuid(), + layer.GetGuid()); + } +} + +void AddWorkloadStructure(std::unique_ptr<profiling::TimelineUtilityMethods>& timelineUtils, + std::unique_ptr<IWorkload>& workload, + const Layer& layer) +{ + // Add workload to the post-optimisation network structure + timelineUtils->CreateTypedEntity(workload->GetGuid(), profiling::LabelsAndEventClasses::WORKLOAD_GUID); + timelineUtils->MarkEntityWithLabel(workload->GetGuid(), + layer.GetBackendId().Get(), + profiling::LabelsAndEventClasses::BACKENDID_GUID); + + // Link the workload to the layer + timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, + layer.GetGuid(), + workload->GetGuid(), + profiling::LabelsAndEventClasses::CHILD_GUID); +} + +TensorInfo AsyncNetwork::GetInputTensorInfo(LayerBindingId layerId) const +{ + for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers()) + { + ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot"); + if (inputLayer->GetBindingId() == layerId) + { + return inputLayer->GetOutputSlot(0).GetTensorInfo(); + } + } + + throw InvalidArgumentException(fmt::format("No input layer is associated with id {0}}", layerId)); +} + +TensorInfo AsyncNetwork::GetOutputTensorInfo(LayerBindingId layerId) const +{ + for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers()) + { + ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot"); + ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected"); + if (outputLayer->GetBindingId() == layerId) + { + return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo(); + } + } + + throw InvalidArgumentException(fmt::format("No output layer is associated with id {0}}", layerId)); +} + +// Need something like the collectors to get the correct tensors for the inputs +void AsyncNetwork::CollectInputTensorHandles( + std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles, + std::vector<ITensorHandle*>& inputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged) +{ + for (auto&& inputSlot : layer->GetInputSlots()) + { + // The graph must be well-formed at this point. 
+ ARMNN_ASSERT(inputSlot.GetConnection()); + auto outputSlot = inputSlot.GetConnectedOutputSlot(); + auto key = outputSlot->GetOwningLayer().GetGuid(); + auto search = tensorHandles.find(key); + + if (search == tensorHandles.end()) + { + ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = outputSlot->GetTensorInfo(); + + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + ITensorHandle* tensorPtr = tensor.release(); + inputs.push_back(tensorPtr); + } + else + { + unsigned int index = outputSlot->CalculateIndexOnOwner(); + inputs.push_back(search->second[index]); + } + } +} + +void AsyncNetwork::CreateOutputTensorHandles( + std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles, + std::vector<ITensorHandle*>& outputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged) +{ + auto guid = layer->GetGuid(); + std::vector<ITensorHandle*> tensorHandleVectors; + tensorHandleVectors.reserve(layer->GetNumOutputSlots()); + + for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++) + { + const OutputSlot& slot = layer->GetOutputSlot(idx); + ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = slot.GetTensorInfo(); + + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + ITensorHandle* tensorPtr = tensor.release(); + outputs.push_back(tensorPtr); + tensorHandleVectors.push_back(tensorPtr); + } + tensorHandles.insert({guid, tensorHandleVectors}); +} + +const IWorkloadFactory& AsyncNetwork::GetWorkloadFactory(const Layer& layer) const +{ + const IWorkloadFactory* workloadFactory = nullptr; + + auto it = m_WorkloadFactories.find(layer.GetBackendId()); + if (it == m_WorkloadFactories.end()) + { + throw RuntimeException( + fmt::format("No workload factory for {0} to be used for layer: {1}", + layer.GetBackendId().Get(), + layer.GetNameStr()), + CHECK_LOCATION()); + } + + workloadFactory = it->second.first.get(); + + ARMNN_ASSERT_MSG(workloadFactory, "No workload factory"); + + std::string reasonIfUnsupported; + ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, {}, reasonIfUnsupported), + "Factory does not support layer"); + IgnoreUnused(reasonIfUnsupported); + return *workloadFactory; +} + +void AsyncNetwork::EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& context) +{ + if (layer.GetType() != LayerType::Input) + { + throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer"); + } + LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid(); + WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id); + ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output"); + + MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); + if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor + { + if (CheckFlag(importFlags, MemorySource::Malloc)) + { + // This assumes a CPU Tensor handle +
std::unique_ptr<ITensorHandle> tensorHandle = + std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(), + inputTensor.GetMemoryArea()); + + void* mem = tensorHandle->Map(false); + if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc)) + { + tensorHandle->Unmap(); + return; + } + tensorHandle->Unmap(); + throw MemoryImportException("EnqueueInput: Memory Import failed"); + } + else + { + throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import"); + } + } + else + { + std::unique_ptr<ITensorHandle> tensorHandle = + std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea()); + + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + for (const auto& input : descriptor.m_Inputs) + { + CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc); + } + } +} + +void AsyncNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle) +{ + if (layer.GetType() != LayerType::Output) + { + throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer"); + } + ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); + + LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid(); + WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id); + + ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0]; + ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated."); + + // Try import the output tensor. + // Note: We can only import the output pointer if all of the following hold true: + // a) The imported pointer is aligned sufficiently + // b) The tensor has zero padding + // c) There is only one connection to the OutputSlot and it is to an OutputLayer. + // d) The output pointer is allocated via malloc. 
(Other types will be supported in a later release) + // e) m_IsExportEnabled must be set to true + if (m_NetworkProperties.m_ExportEnabled && + (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) + { + if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) + { + MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); + if (CheckFlag(importFlags, MemorySource::Malloc)) + { + std::unique_ptr<ITensorHandle> tensorHandle = + std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), + outputTensor.GetMemoryArea()); + + void* mem = tensorHandle->Map(false); + bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); + tensorHandle->Unmap(); + + if (importOk) + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute"); + descriptor.m_Inputs[0]->Map(true); + descriptor.m_Inputs[0]->Unmap(); + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed"); + } + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export"); + } + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer"); + } + } + else + { + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + std::unique_ptr<ITensorHandle> tensorHandle = + std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); + + CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc); + } +} + +AsyncNetwork::AsyncNetwork(std::unique_ptr<IOptimizedNetwork> net, + const INetworkProperties& networkProperties, + profiling::ProfilingService& profilingService) : + m_OptimizedNetwork(std::move(net)), + m_NetworkProperties(networkProperties), + m_ProfilingService(profilingService) +{ + // Create a profiler and register it for the current thread. + m_Profiler = std::make_shared<IProfiler>(); + ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get()); + + Graph &order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); + + //First create tensor handlers, backends and workload factories. + //Handlers are created before workloads are. + //Because workload creation can modify some of the handlers, + //(for example the splitter and concat layers). 
+ for (auto &&layer : order) + { + auto const &backendId = layer->GetBackendId(); + if (m_Backends.count(backendId) == 0) + { + auto createBackend = BackendRegistryInstance().GetFactory(backendId); + auto it = m_Backends.emplace(std::make_pair(backendId, createBackend())); + + IBackendInternal* backend = it.first->second.get(); + + if (backend->SupportsTensorAllocatorAPI()) + { + backend->RegisterTensorHandleFactories(m_TensorHandleFactoryRegistry); + + auto workloadFactory = backend->CreateWorkloadFactory(m_TensorHandleFactoryRegistry); + m_WorkloadFactories.emplace( + std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr))); + } + else + { + IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager(); + auto workloadFactory = backend->CreateWorkloadFactory(memoryManager); + + m_WorkloadFactories.emplace( + std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager))); + } + } + } + + profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); + std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils = + profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); + if (timelineUtils) + { + timelineUtils->CreateTypedEntity(networkGuid, profiling::LabelsAndEventClasses::NETWORK_GUID); + } + + //Then create workloads. + for (auto &&layer : order) + { + if (timelineUtils) + { + // Add layer to the post-optimisation network structure + AddLayerStructure(timelineUtils, *layer, networkGuid); + } + + const IWorkloadFactory &workloadFactory = GetWorkloadFactory(*layer); + + switch (layer->GetType()) + { + case LayerType::Input: + case LayerType::Output: + { + // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput(). + break; + } + default: + { + auto workload = layer->CreateWorkload(workloadFactory); + + if (!workload) + { + const char* const layerName = + layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>"; + throw InvalidArgumentException( + fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')", + layerName, + static_cast<int>(layer->GetType()), + layer->GetBackendId().Get() + )); + } + + if (timelineUtils) + { + // Add workload to the post-optimisation network structure + AddWorkloadStructure(timelineUtils, workload, *layer); + } + + m_WorkloadQueue.push_back(move(workload)); + // release the constant data in the layer.. + layer->ReleaseConstantData(); + break; + } + } + } + + if (timelineUtils) + { + // Commit to send the post-optimisation network structure + timelineUtils->Commit(); + } + + // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload. + // PostAllocationConfiguure will now need to be handled in the ExecuteOn(WorkingMemDescriptor) + for (auto &workload : m_WorkloadQueue) + { + workload->PostAllocationConfigure(); + } +} + +Status AsyncNetwork::Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& iWorkingMemHandle) +{ + const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + + // Walk graph to determine the order of execution. 
+ if (graph.GetNumLayers() < 2) + { + ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; + return Status::Failure; + } + + if (graph.GetNumInputs() != inputTensors.size()) + { + throw InvalidArgumentException("Number of inputs provided does not match network."); + } + + std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils = + profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); + profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid(); + if (timelineUtils) + { + // Add inference timeline trace if profiling is enabled. + profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); + timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID); + timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, + networkGuid, + inferenceGuid, + profiling::LabelsAndEventClasses::EXECUTION_OF_GUID); + timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS); + } + + bool executionSucceeded = true; + + if (timelineUtils) + { + // Add end of life of the inference timeline if profiling is enabled. + timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS); + timelineUtils->Commit(); + } + WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle); + std::lock_guard<std::mutex> lockGuard(workingMemHandle.GetMutex()); + + if (!workingMemHandle.IsAllocated()) + { + workingMemHandle.Allocate(); + } + + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs"); + unsigned int i = 0; + + for (const BindableLayer* inputLayer : graph.GetInputLayers()) + { + EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle); + ++i; + } + } + + auto Fail = [&](const std::exception& error) + { + ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what(); + executionSucceeded = false; + }; + profiling::ProfilingDynamicGuid workloadInferenceID(0); + + try + { + for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i) + { + auto& workload = m_WorkloadQueue[i]; + if (timelineUtils) + { + workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), + inferenceGuid); + } + workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i)); + + if (timelineUtils) + { + timelineUtils->RecordEndOfLifeEvent(workloadInferenceID); + } + } + } + catch (const RuntimeException& error) + { + Fail(error); + } + catch (const std::runtime_error& error) + { + Fail(error); + } + // For each output to the network, call EnqueueOutput with the data passed by the user. + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs"); + unsigned int i = static_cast<unsigned int>(m_WorkloadQueue.size() - graph.GetNumOutputs()); + + for (const BindableLayer* outputLayer : graph.GetOutputLayers()) + { + EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle); + ++i; + } + } + return executionSucceeded ? Status::Success : Status::Failure; +} + +/// Get the profiler used for this network +std::shared_ptr<IProfiler> AsyncNetwork::GetProfiler() const +{ + return m_Profiler; +} + +void AsyncNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) +{ + for (auto&& workloadPtr: m_WorkloadQueue) + { + workloadPtr.get()->RegisterDebugCallback(func); + } +} + +/// Create a new unique WorkingMemHandle object. 
Create multiple handles if you wish to have +/// overlapped Execution by calling this function from different threads. +std::unique_ptr<IWorkingMemHandle> AsyncNetwork::CreateWorkingMemHandle() +{ + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + std::unordered_map<LayerGuid, std::vector<ITensorHandle*> > tensorHandles; + std::vector<WorkingMemDescriptor> workingMemDescriptors; + std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap; + + for (auto&& layer : order) + { + if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output) + { + continue; + } + WorkingMemDescriptor workingMemDescriptor; + // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer + // If Export is enabled disable memory management so we can export, otherwise we do a copy + if((layer->GetNumOutputSlots() == 1) && + (layer->GetOutputSlots()[0].GetNumConnections() == 1) && + (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + { + CollectInputTensorHandles(tensorHandles, + workingMemDescriptor.m_Inputs, + layer, + m_TensorHandleFactoryRegistry, + !m_NetworkProperties.m_ExportEnabled); + CreateOutputTensorHandles(tensorHandles, + workingMemDescriptor.m_Outputs, + layer, + m_TensorHandleFactoryRegistry, + !m_NetworkProperties.m_ExportEnabled); + } + else + { + CollectInputTensorHandles(tensorHandles, + workingMemDescriptor.m_Inputs, + layer, + m_TensorHandleFactoryRegistry); + CreateOutputTensorHandles(tensorHandles, + workingMemDescriptor.m_Outputs, + layer, + m_TensorHandleFactoryRegistry); + } + workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); + workingMemDescriptors.push_back(workingMemDescriptor); + } + return std::make_unique<WorkingMemHandle>(workingMemDescriptors, workingMemDescriptorMap); +} + +void AsyncNetwork::FreeWorkingMemory() +{ + // Informs the memory managers to release memory in it's respective memory group + for (auto&& workloadFactory : m_WorkloadFactories) + { + IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second; + if (memoryManager) + { + memoryManager->Release(); + } + } + m_TensorHandleFactoryRegistry.ReleaseMemory(); +} + +} // end experimental namespace + +} // end armnn namespace diff --git a/src/armnn/AsyncNetwork.hpp b/src/armnn/AsyncNetwork.hpp new file mode 100644 index 0000000000..9c525c5472 --- /dev/null +++ b/src/armnn/AsyncNetwork.hpp @@ -0,0 +1,106 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <armnn/IAsyncNetwork.hpp> +#include <armnn/Tensor.hpp> +#include <armnn/Types.hpp> + +#include "LayerFwd.hpp" +#include "Network.hpp" +#include "Profiling.hpp" +#include "WorkingMemHandle.hpp" + +#include <armnn/backends/IBackendInternal.hpp> +#include <backendsCommon/TensorHandleFactoryRegistry.hpp> +#include <backendsCommon/Workload.hpp> +#include <backendsCommon/WorkloadFactory.hpp> +#include <ProfilingService.hpp> +#include <TimelineUtilityMethods.hpp> + +#include <unordered_map> + +namespace armnn +{ + +namespace experimental +{ + +class AsyncNetwork final : public IAsyncNetwork +{ +public: + using WorkloadQueue = std::vector<std::unique_ptr<IWorkload>>; + + AsyncNetwork(std::unique_ptr<IOptimizedNetwork> net, + const INetworkProperties &networkProperties, + profiling::ProfilingService &profilingService); + + ~AsyncNetwork() { FreeWorkingMemory(); } + + TensorInfo GetInputTensorInfo(LayerBindingId layerId) const override; + TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const override; + + /// Thread safe execution of the network. Returns once execution is complete. + /// Will block until this and any other thread using the same workingMem object completes. + virtual Status Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& workingMemHandle) override; + + /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have + /// overlapped Execution by calling this function from different threads. + std::unique_ptr<IWorkingMemHandle> CreateWorkingMemHandle() override; + + /// Get the profiler used for this network + std::shared_ptr<IProfiler> GetProfiler() const override; + + /// Register a debug callback function to be used with this network + void RegisterDebugCallback(const DebugCallbackFunction& func) override; + +private: + void FreeWorkingMemory(); + + void CollectInputTensorHandles(std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles, + std::vector<ITensorHandle*>& inputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged = false); + + void CreateOutputTensorHandles(std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles, + std::vector<ITensorHandle*>& outputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged = false); + + void EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& handle); + + void EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle); + + using BackendPtrMap = std::unordered_map<BackendId, IBackendInternalUniquePtr>; + + using WorkloadFactoryWithMemoryManager = + std::pair<IBackendInternal::IWorkloadFactoryPtr, IBackendInternal::IMemoryManagerSharedPtr>; + + using WorkloadFactoryMap = std::unordered_map<BackendId, WorkloadFactoryWithMemoryManager>; + + const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const; + + BackendPtrMap m_Backends; + WorkloadFactoryMap m_WorkloadFactories; + + std::unique_ptr<IOptimizedNetwork> m_OptimizedNetwork; + INetworkProperties m_NetworkProperties; + WorkloadQueue m_WorkloadQueue; + std::shared_ptr<IProfiler> m_Profiler; + + TensorHandleFactoryRegistry m_TensorHandleFactoryRegistry; + + /// Profiling Service Instance + profiling::ProfilingService& m_ProfilingService; +}; + +} // end experimental namespace + +} // end armnn namespace diff --git 
a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index 9cc7b2cb81..5dc1ef9cc5 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -64,6 +64,14 @@ Status IRuntime::LoadNetwork(NetworkId& networkIdOut, return pRuntimeImpl->LoadNetwork(networkIdOut, std::move(network), errorMessage, networkProperties); } +std::unique_ptr<IAsyncNetwork> IRuntime::CreateAsyncNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string& errorMessage, + const INetworkProperties& networkProperties) +{ + return pRuntimeImpl->CreateAsyncNetwork(networkIdOut, std::move(network), errorMessage, networkProperties); +} + TensorInfo IRuntime::GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const { return pRuntimeImpl->GetInputTensorInfo(networkId, layerId); @@ -165,6 +173,43 @@ Status RuntimeImpl::LoadNetwork(NetworkId& networkIdOut, return Status::Success; } +std::unique_ptr<IAsyncNetwork> RuntimeImpl::CreateAsyncNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string&, + const INetworkProperties& networkProperties) +{ + IOptimizedNetwork* rawNetwork = network.release(); + + networkIdOut = GenerateNetworkId(); + + for (auto&& context : m_BackendContexts) + { + context.second->BeforeLoadNetwork(networkIdOut); + } + + unique_ptr<AsyncNetwork> asyncNetwork = std::make_unique<AsyncNetwork>( + std::unique_ptr<IOptimizedNetwork>(rawNetwork), + networkProperties, + m_ProfilingService); + + if (!asyncNetwork) + { + return nullptr; + } + + for (auto&& context : m_BackendContexts) + { + context.second->AfterLoadNetwork(networkIdOut); + } + + if (m_ProfilingService.IsProfilingEnabled()) + { + m_ProfilingService.IncrementCounterValue(armnn::profiling::NETWORK_LOADS); + } + + return asyncNetwork; +} + Status RuntimeImpl::UnloadNetwork(NetworkId networkId) { bool unloadOk = true; diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp index 2c7e07f9fb..150012eb61 100644 --- a/src/armnn/Runtime.hpp +++ b/src/armnn/Runtime.hpp @@ -4,6 +4,7 @@ // #pragma once +#include "AsyncNetwork.hpp" #include "LoadedNetwork.hpp" #include "DeviceSpec.hpp" @@ -55,6 +56,13 @@ public: TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; + // Create Asynchronous Network from the IOptimizedNetworkPtr + std::unique_ptr<IAsyncNetwork> CreateAsyncNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string& errorMessage, + const INetworkProperties& networkProperties); + + + // Evaluates network using input in inputTensors, outputs filled into outputTensors. Status EnqueueWorkload(NetworkId networkId, const InputTensors& inputTensors, diff --git a/src/armnn/WorkingMemDescriptor.hpp b/src/armnn/WorkingMemDescriptor.hpp new file mode 100644 index 0000000000..688082e77b --- /dev/null +++ b/src/armnn/WorkingMemDescriptor.hpp @@ -0,0 +1,29 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <armnn/backends/ITensorHandle.hpp> + +#include <vector> + +namespace armnn +{ + +namespace experimental +{ + +struct WorkingMemDescriptor +{ + std::vector<ITensorHandle*> m_Inputs; + std::vector<ITensorHandle*> m_Outputs; + + ~WorkingMemDescriptor() = default; + WorkingMemDescriptor() = default; +}; + +} // end experimental namespace + +} // end armnn namespace diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp new file mode 100644 index 0000000000..7a901b296b --- /dev/null +++ b/src/armnn/WorkingMemHandle.cpp @@ -0,0 +1,49 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "backendsCommon/CpuTensorHandle.hpp" +#include "WorkingMemHandle.hpp" +#include "Network.hpp" + +namespace armnn +{ + +namespace experimental +{ + +WorkingMemHandle::WorkingMemHandle(std::vector<WorkingMemDescriptor> workingMemDescriptors, + std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap) : + m_WorkingMemDescriptors(workingMemDescriptors), + m_WorkingMemDescriptorMap(workingMemDescriptorMap), + m_IsAllocated(false), + m_Mutex() +{} + +void WorkingMemHandle::FreeWorkingMemory() +{ + for (auto workingMemDescriptor : m_WorkingMemDescriptors) + { + for (auto input : workingMemDescriptor.m_Inputs) + { + if (input) + { + delete input; + input = nullptr; + } + } + for (auto output : workingMemDescriptor.m_Outputs) + { + if (output) + { + delete output; + output = nullptr; + } + } + } +} + +} // end experimental namespace + +} // end armnn namespace diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp new file mode 100644 index 0000000000..090f180206 --- /dev/null +++ b/src/armnn/WorkingMemHandle.hpp @@ -0,0 +1,119 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "Layer.hpp" +#include "Network.hpp" +#include "WorkingMemDescriptor.hpp" + +#include <armnn/IWorkingMemHandle.hpp> +#include <armnn/Tensor.hpp> + +#include <unordered_map> + +namespace armnn +{ + +namespace experimental +{ + +class WorkingMemHandle final : public IWorkingMemHandle +{ + +public: + WorkingMemHandle(std::vector<WorkingMemDescriptor> workingMemDescriptors, + std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap); + + ~WorkingMemHandle() { FreeWorkingMemory(); } + + /// Allocate the backing memory required for execution. If this is not called, then allocation will be + /// deferred to execution time. The mutex must be locked. + void Allocate() override + { + if (m_IsAllocated) + { + return; + } + m_IsAllocated = true; + + // Iterate through all WorkingMemDescriptors calling allocate() on each input and output in turn + for (auto workingMemDescriptor : m_WorkingMemDescriptors) + { + for (auto& input : workingMemDescriptor.m_Inputs) + { + input->Allocate(); + } + for (auto& output : workingMemDescriptor.m_Outputs) + { + output->Allocate(); + } + } + } + + /// Free the backing memory required for execution. The mutex must be locked. 
+ void Free() override + { + if (!m_IsAllocated) + { + return; + } + m_IsAllocated = false; + + // Iterate through all WorkingMemDescriptors calling free() on each input and output in turn + for (auto workingMemDescriptor : m_WorkingMemDescriptors) + { + for (auto& input : workingMemDescriptor.m_Inputs) + { + input->Unmap(); + } + for (auto& output : workingMemDescriptor.m_Outputs) + { + output->Unmap(); + } + } + } + + /// IsAllocated returns true if the backing memory is currently allocated. The mutex must be locked. + bool IsAllocated() override + { + return m_IsAllocated; + } + + /// Get a mutex which can be used for synchronizing access to the WorkingMemHandle object. + std::mutex& GetMutex() override + { + return m_Mutex; + } + + /// Get the WorkingMemDescriptor for a Layer. The mutex must be locked. + WorkingMemDescriptor& GetWorkingMemDescriptor(LayerGuid id) override + { + auto result = m_WorkingMemDescriptorMap.find(id); + ARMNN_ASSERT(result != m_WorkingMemDescriptorMap.end()); + return result->second; + } + + /// Get the WorkingMemDescriptor at an index. The WorkingMemDescriptors are stored in the same order as + /// the Workloads in a topologically sorted graph. The mutex must be locked. + WorkingMemDescriptor& GetWorkingMemDescriptorAt(unsigned int id) override + { + return m_WorkingMemDescriptors[id]; + } + +private: + void FreeWorkingMemory(); + + std::shared_ptr<ProfilerImpl> m_Profiler; + + std::vector<WorkingMemDescriptor> m_WorkingMemDescriptors; + std::unordered_map<LayerGuid, WorkingMemDescriptor> m_WorkingMemDescriptorMap; + bool m_IsAllocated; + std::mutex m_Mutex; +}; + +} // end experimental namespace + +} // end armnn namespace diff --git a/src/backends/backendsCommon/MemCopyWorkload.cpp b/src/backends/backendsCommon/MemCopyWorkload.cpp index 7bdc05e4a2..813adefed7 100644 --- a/src/backends/backendsCommon/MemCopyWorkload.cpp +++ b/src/backends/backendsCommon/MemCopyWorkload.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -40,7 +40,7 @@ void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor, CopyMemGenericWorkload::CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) + const WorkloadInfo& info) : BaseWorkload<MemCopyQueueDescriptor>(descriptor, info) { GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); @@ -61,4 +61,21 @@ void CopyMemGenericWorkload::Execute() const } } +void CopyMemGenericWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor) +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyMemGeneric_Execute_WorkingMemDescriptor"); + std::vector<TensorHandlePair> tensorHandlePairs; + GatherTensorHandlePairs(descriptor, tensorHandlePairs); + + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + for (const auto& pair : tensorHandlePairs) + { + CopyTensorContentsGeneric(pair.first, pair.second, copyFunc); + } +} + } //namespace armnn diff --git a/src/backends/backendsCommon/MemCopyWorkload.hpp b/src/backends/backendsCommon/MemCopyWorkload.hpp index 65292861fb..12664fd527 100644 --- a/src/backends/backendsCommon/MemCopyWorkload.hpp +++ b/src/backends/backendsCommon/MemCopyWorkload.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #pragma once @@ -19,6 +19,7 @@ class CopyMemGenericWorkload : public BaseWorkload<MemCopyQueueDescriptor> public: CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; + void ExecuteAsync(WorkingMemDescriptor& descriptor) override; private: using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>; diff --git a/src/backends/backendsCommon/MemSyncWorkload.cpp b/src/backends/backendsCommon/MemSyncWorkload.cpp index b29c46e918..fe04a3024b 100644 --- a/src/backends/backendsCommon/MemSyncWorkload.cpp +++ b/src/backends/backendsCommon/MemSyncWorkload.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -14,7 +14,7 @@ namespace armnn { SyncMemGenericWorkload::SyncMemGenericWorkload(const MemSyncQueueDescriptor& descriptor, - const WorkloadInfo& info) + const WorkloadInfo& info) : BaseWorkload<MemSyncQueueDescriptor>(descriptor, info) { m_TensorHandle = descriptor.m_Inputs[0]; @@ -27,4 +27,11 @@ void SyncMemGenericWorkload::Execute() const m_TensorHandle->Unmap(); } +void SyncMemGenericWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor) +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute_WorkingMemDescriptor"); + descriptor.m_Inputs[0]->Map(true); + descriptor.m_Inputs[0]->Unmap(); +} + } //namespace armnn diff --git a/src/backends/backendsCommon/MemSyncWorkload.hpp b/src/backends/backendsCommon/MemSyncWorkload.hpp index 0d44788c70..8142f180a6 100644 --- a/src/backends/backendsCommon/MemSyncWorkload.hpp +++ b/src/backends/backendsCommon/MemSyncWorkload.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -19,6 +19,7 @@ class SyncMemGenericWorkload : public BaseWorkload<MemSyncQueueDescriptor> public: SyncMemGenericWorkload(const MemSyncQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; + void ExecuteAsync(WorkingMemDescriptor& descriptor) override; private: ITensorHandle* m_TensorHandle; diff --git a/src/backends/backendsCommon/Workload.hpp b/src/backends/backendsCommon/Workload.hpp index 482f9bd26d..940b878d2f 100644 --- a/src/backends/backendsCommon/Workload.hpp +++ b/src/backends/backendsCommon/Workload.hpp @@ -1,11 +1,12 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #pragma once #include "WorkloadData.hpp" #include "WorkloadInfo.hpp" +#include "WorkingMemDescriptor.hpp" #include <armnn/backends/IWorkload.hpp> #include <Profiling.hpp> @@ -36,6 +37,8 @@ public: m_Data.Validate(info); } + void ExecuteAsync(WorkingMemDescriptor&) override {}; + void PostAllocationConfigure() override {} const QueueDescriptor& GetData() const { return m_Data; } diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt index d3857b8357..9d36f52b59 100644 --- a/src/backends/backendsCommon/test/CMakeLists.txt +++ b/src/backends/backendsCommon/test/CMakeLists.txt @@ -51,6 +51,7 @@ list(APPEND armnnBackendsCommonUnitTests_sources SpaceToDepthEndToEndTestImpl.cpp SpaceToDepthEndToEndTestImpl.hpp SplitterEndToEndTestImpl.hpp + StridedSliceAsyncEndToEndTest.hpp TensorCopyUtils.cpp TensorCopyUtils.hpp WorkloadFactoryHelper.hpp diff --git a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp index 9ce42019f0..3a757d0c59 100644 --- a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp +++ b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp @@ -4,6 +4,8 @@ // #pragma once +#include "CommonTestUtils.hpp" + #include <armnn/Descriptors.hpp> #include <armnn/INetwork.hpp> #include <armnn/IRuntime.hpp> @@ -105,23 +107,6 @@ inline bool ConstantUsageUint8Test(const std::vector<BackendId>& backends) ); } -// Utility template for comparing tensor elements -template<DataType ArmnnType, typename T = ResolveType<ArmnnType>> -bool Compare(T a, T b, float tolerance = 0.000001f) -{ - if (ArmnnType == DataType::Boolean) - { - // NOTE: Boolean is represented as uint8_t (with zero equals - // false and everything else equals true), therefore values - // need to be casted to bool before comparing them - return static_cast<bool>(a) == static_cast<bool>(b); - } - - // NOTE: All other types can be cast to float and compared with - // a certain level of tolerance - return std::fabs(static_cast<float>(a) - static_cast<float>(b)) <= tolerance; -} - // Utility function to find the number of instances of a substring within a string. int SubStringCounter(std::string& string, std::string&& substring) { diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp new file mode 100644 index 0000000000..2ccd2b13af --- /dev/null +++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp @@ -0,0 +1,178 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <ResolveType.hpp> + +#include <armnn/IWorkingMemHandle.hpp> +#include <armnn/INetwork.hpp> + +#include <backendsCommon/test/CommonTestUtils.hpp> + +#include <boost/test/unit_test.hpp> + +#include <vector> + +namespace armnn +{ + +namespace experimental +{ + +template<DataType ArmnnIType, DataType ArmnnOType, + typename TInput = ResolveType <ArmnnIType>, typename TOutput = ResolveType <ArmnnOType>> +void AsyncEndToEndTestImpl(INetworkPtr network, + const std::map<int, std::vector<TInput>>& inputTensorData, + const std::map<int, std::vector<TOutput>>& expectedOutputData, + std::vector<BackendId> backends, + float tolerance = 0.000001f) +{ + // Create Runtime in which test will run + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Optimize the Network + IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec()); + + // Creates AsyncNetwork + NetworkId networkId = 0; + std::string errorMessage; + const INetworkProperties networkProperties; + auto asyncNetwork = runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties); + + InputTensors inputTensors; + inputTensors.reserve(inputTensorData.size()); + for (auto&& it : inputTensorData) + { + inputTensors.push_back({it.first, + ConstTensor(asyncNetwork->GetInputTensorInfo(it.first), it.second.data())}); + } + + OutputTensors outputTensors; + outputTensors.reserve(expectedOutputData.size()); + std::map<int, std::vector<TOutput>> outputStorage; + for (auto&& it : expectedOutputData) + { + std::vector<TOutput> out(it.second.size()); + outputStorage.emplace(it.first, out); + outputTensors.push_back({it.first, + Tensor(asyncNetwork->GetOutputTensorInfo(it.first), + outputStorage.at(it.first).data())}); + } + + // Create WorkingMemHandle for this async network + std::unique_ptr<IWorkingMemHandle> workingMemHandle = asyncNetwork->CreateWorkingMemHandle(); + IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get(); + + // Run the async network + asyncNetwork->Execute(inputTensors, outputTensors, workingMemHandleRef); + + // Checks the results. + for (auto&& it : expectedOutputData) + { + std::vector<TOutput> out = outputStorage.at(it.first); + for (unsigned int i = 0; i < out.size(); ++i) + { + BOOST_CHECK(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true); + } + } +} + +template<typename armnn::DataType DataType> +INetworkPtr CreateStridedSliceNetwork(const TensorShape& inputShape, + const TensorShape& outputShape, + const std::vector<int>& beginData, + const std::vector<int>& endData, + const std::vector<int>& stridesData, + int beginMask = 0, + int endMask = 0, + int shrinkAxisMask = 0, + int ellipsisMask = 0, + int newAxisMask = 0, + const float qScale = 1.0f, + const int32_t qOffset = 0) +{ + using namespace armnn; + // Builds up the structure of the network. 
+ INetworkPtr net(INetwork::Create()); + + TensorInfo inputTensorInfo(inputShape, DataType, qScale, qOffset); + TensorInfo outputTensorInfo(outputShape, DataType, qScale, qOffset); + + armnn::StridedSliceDescriptor stridedSliceDescriptor; + stridedSliceDescriptor.m_Begin = beginData; + stridedSliceDescriptor.m_End = endData; + stridedSliceDescriptor.m_Stride = stridesData; + stridedSliceDescriptor.m_BeginMask = beginMask; + stridedSliceDescriptor.m_EndMask = endMask; + stridedSliceDescriptor.m_ShrinkAxisMask = shrinkAxisMask; + stridedSliceDescriptor.m_EllipsisMask = ellipsisMask; + stridedSliceDescriptor.m_NewAxisMask = newAxisMask; + + IConnectableLayer* input = net->AddInputLayer(0, "Input_Layer"); + IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(stridedSliceDescriptor, "splitter"); + IConnectableLayer* output = net->AddOutputLayer(0); + + Connect(input, stridedSlice, inputTensorInfo, 0, 0); + Connect(stridedSlice, output, outputTensorInfo, 0, 0); + + return net; +} + +template<armnn::DataType ArmnnType> +void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends) +{ + using namespace armnn; + using T = ResolveType<ArmnnType>; + + const TensorShape& inputShape = {3, 2, 3, 1}; + const TensorShape& outputShape = {1, 2, 3, 1}; + const std::vector<int>& beginData = {1, 0, 0, 0}; + const std::vector<int>& endData = {2, 2, 3, 1}; + const std::vector<int>& stridesData = {1, 1, 1, 1}; + int beginMask = 0; + int endMask = 0; + int shrinkAxisMask = 0; + int ellipsisMask = 0; + int newAxisMask = 0; + + // Builds up the structure of the network + INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape, + outputShape, + beginData, + endData, + stridesData, + beginMask, + endMask, + shrinkAxisMask, + ellipsisMask, + newAxisMask); + + BOOST_TEST_CHECKPOINT("create a network"); + + // Creates structures for input & output. + std::vector<T> inputData{ + 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, + + 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f, + + 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f + }; + + std::vector<T> outputExpected{ + 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f + }; + + std::map<int, std::vector<T>> inputTensorData = {{0, inputData}}; + std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}}; + + AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends); +} + +} // experimental namespace + +} // armnn namespace + diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp index b6974811ef..521854b12b 100644 --- a/src/backends/reference/test/RefEndToEndTests.cpp +++ b/src/backends/reference/test/RefEndToEndTests.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // @@ -25,6 +25,7 @@ #include <backendsCommon/test/ResizeEndToEndTestImpl.hpp> #include <backendsCommon/test/SpaceToDepthEndToEndTestImpl.hpp> #include <backendsCommon/test/SplitterEndToEndTestImpl.hpp> +#include <backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp> #include <backendsCommon/test/TransposeConvolution2dEndToEndTestImpl.hpp> #include <boost/test/unit_test.hpp> @@ -1336,6 +1337,10 @@ BOOST_AUTO_TEST_CASE(RefStridedSliceInvalidSliceEndToEndTest) StridedSliceInvalidSliceEndToEndTest(defaultBackends); } +BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedEndToEndTest) +{ + armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends); +} #endif BOOST_AUTO_TEST_SUITE_END() diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp index 6a29439cc0..ce807ee087 100644 --- a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp +++ b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -35,4 +35,24 @@ void RefStridedSliceWorkload::Execute() const GetDataTypeSize(inputDataType)); } +void RefStridedSliceWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor) +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStridedSliceWorkload_Execute_WorkingMemDescriptor"); + + const TensorInfo& inputInfo = GetTensorInfo(descriptor.m_Inputs[0]); + const TensorInfo& outputInfo = GetTensorInfo(descriptor.m_Outputs[0]); + + DataType inputDataType = inputInfo.GetDataType(); + DataType outputDataType = outputInfo.GetDataType(); + + ARMNN_ASSERT(inputDataType == outputDataType); + IgnoreUnused(outputDataType); + + StridedSlice(inputInfo, + m_Data.m_Parameters, + descriptor.m_Inputs[0]->Map(), + descriptor.m_Outputs[0]->Map(), + GetDataTypeSize(inputDataType)); +} + } // namespace armnn diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp index 44aabc0106..3e253edcd9 100644 --- a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp +++ b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -15,6 +15,7 @@ class RefStridedSliceWorkload : public BaseWorkload<StridedSliceQueueDescriptor> public: RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; + void ExecuteAsync(WorkingMemDescriptor& descriptor) override; }; } // namespace armnn |
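
A minimal client-side sketch of the experimental API introduced by this change. The ArmNN calls (Optimize, CreateAsyncNetwork, CreateWorkingMemHandle, Execute) and the default-constructed INetworkProperties are taken from the headers in this patch; RunParallelInference, the CpuRef backend choice, the binding id 0 and the float buffers are assumptions made for the example, not part of the change.

// Illustrative sketch, not part of the patch. Assumes the network has a single
// input and a single output, both bound with LayerBindingId 0.
#include <armnn/ArmNN.hpp>

#include <string>
#include <thread>
#include <vector>

void RunParallelInference(armnn::INetworkPtr network) // hypothetical: network built elsewhere
{
    using namespace armnn;
    using namespace armnn::experimental;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime = IRuntime::Create(options);

    std::vector<BackendId> backends = { Compute::CpuRef }; // assumption: reference backend
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());

    // Unlike LoadNetwork, the async network is never registered with the runtime,
    // so no UnloadNetwork call is needed when it goes out of scope.
    NetworkId networkId = 0;
    std::string errorMessage;
    const INetworkProperties networkProperties;
    std::unique_ptr<IAsyncNetwork> asyncNetwork =
        runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties);

    // Each thread owns its WorkingMemHandle and its own I/O buffers, so the two
    // Execute calls can overlap; Execute blocks until its own inference completes.
    auto worker = [&asyncNetwork]()
    {
        const TensorInfo inputInfo  = asyncNetwork->GetInputTensorInfo(0);
        const TensorInfo outputInfo = asyncNetwork->GetOutputTensorInfo(0);

        std::vector<float> inputData(inputInfo.GetNumElements(), 1.0f); // illustrative data
        std::vector<float> outputData(outputInfo.GetNumElements());

        InputTensors  inputs{ { 0, ConstTensor(inputInfo, inputData.data()) } };
        OutputTensors outputs{ { 0, Tensor(outputInfo, outputData.data()) } };

        std::unique_ptr<IWorkingMemHandle> workingMem = asyncNetwork->CreateWorkingMemHandle();
        asyncNetwork->Execute(inputs, outputs, *workingMem);
    };

    std::thread first(worker);
    std::thread second(worker);
    first.join();
    second.join();
}

Each thread owns both its working memory and its tensor buffers while the IAsyncNetwork itself is shared, which is the split the WorkingMemHandle design is meant to enable.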
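On the backend side, IWorkload gains ExecuteAsync(WorkingMemDescriptor&) next to Execute(), with a no-op default added to BaseWorkload in Workload.hpp above. The following sketch shows the pattern the reference workloads in this patch follow; MyPassthroughWorkload is hypothetical, while BaseWorkload, MemCopyQueueDescriptor, WorkingMemDescriptor and the CopyTensorContentsGeneric helper are the real types used elsewhere in the change.

// Sketch of a workload adopting the new ExecuteAsync hook, modelled on
// CopyMemGenericWorkload::ExecuteAsync above. Not part of the patch.
#include <backendsCommon/Workload.hpp>
#include <backendsCommon/WorkloadUtils.hpp>

#include <cstring>

namespace armnn
{

// Hypothetical workload that copies its single input to its single output.
class MyPassthroughWorkload : public BaseWorkload<MemCopyQueueDescriptor>
{
public:
    using BaseWorkload<MemCopyQueueDescriptor>::BaseWorkload;

    void Execute() const override
    {
        // Synchronous path: uses the tensor handles captured in m_Data at load time.
        DoCopy(m_Data.m_Inputs[0], m_Data.m_Outputs[0]);
    }

    void ExecuteAsync(WorkingMemDescriptor& descriptor) override
    {
        // Asynchronous path: uses the per-call handles owned by the caller's
        // WorkingMemHandle, so concurrent inferences do not share intermediates.
        DoCopy(descriptor.m_Inputs[0], descriptor.m_Outputs[0]);
    }

private:
    static void DoCopy(const ITensorHandle* src, ITensorHandle* dst)
    {
        auto copyFunc = [](void* dstPtr, const void* srcPtr, size_t size)
        {
            memcpy(dstPtr, srcPtr, size);
        };
        CopyTensorContentsGeneric(src, dst, copyFunc);
    }
};

} // namespace armnn

The point of the split is that Execute() reads the handles baked into m_Data when the network was loaded, while ExecuteAsync() reads whatever handles the supplied WorkingMemDescriptor carries, so one workload instance can serve several concurrent inferences.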