ArmNN
 21.05
LoadedNetwork.cpp
1 //
2 // Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 
6 #include "LoadedNetwork.hpp"
7 #include "Layer.hpp"
8 #include "Graph.hpp"
9 #include "Network.hpp"
10 #include <Processes.hpp>
11 #include "Profiling.hpp"
12 #include "HeapProfiling.hpp"
13 #include "WorkingMemHandle.hpp"
14 
16 #include <armnn/Logging.hpp>
17 #include <armnn/utility/Assert.hpp>
18 
23 
25 
26 #include <fmt/format.h>
27 #include <armnn/utility/Timer.hpp>
28 
29 namespace armnn
30 {
31 
32 using namespace std;
33 using namespace armnn::profiling;
34 
35 namespace
36 {
37 
38 template <typename ExceptionType>
39 std::string ToErrorMessage(const char * prefix, const ExceptionType & error)
40 {
41  std::stringstream ss;
42  ss << prefix << " " << error.what();
43  return ss.str();
44 }
45 
46 void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
47  const Layer& layer,
48  ProfilingGuid networkGuid)
49 {
50  // Add layer to the post-optimisation network structure
51  std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
52  timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
53  networkGuid,
54  layerName,
55  LabelsAndEventClasses::LAYER_GUID);
56  for (auto&& input : layer.GetInputSlots())
57  {
58  const IOutputSlot* source = input.GetConnectedOutputSlot();
59  ARMNN_ASSERT(source != NULL);
60  timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
61  source->GetOwningLayerGuid(),
62  layer.GetGuid());
63  }
64 }
65 
66 void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
67  std::unique_ptr<IWorkload>& workload,
68  const Layer& layer)
69 {
70  // Add workload to the post-optimisation network structure
71  timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
72  timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
73  layer.GetBackendId().Get(),
74  LabelsAndEventClasses::BACKENDID_GUID);
75 
76  // Link the workload to the layer
77  timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
78  layer.GetGuid(),
79  workload->GetGuid(),
80  LabelsAndEventClasses::CHILD_GUID);
81 }
82 
83 } // anonymous
84 
85 std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
86  std::string& errorMessage,
87  const INetworkProperties& networkProperties,
88  profiling::ProfilingService& profilingService,
89  const NetworkId networkIdOut)
90 {
91  std::unique_ptr<LoadedNetwork> loadedNetwork;
92 
93  auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
94  {
95  errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
96  ARMNN_LOG(error) << errorMessage;
97 
98  return std::unique_ptr<LoadedNetwork>();
99  };
100 
101  try
102  {
103  loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService, networkIdOut));
104  }
105  catch (const armnn::RuntimeException& error)
106  {
107  return Fail(error);
108  }
109  catch (const armnn::Exception& error)
110  {
111  return Fail(error);
112  }
113  catch (const std::runtime_error& error)
114  {
115  return Fail(error);
116  }
117 
118  return loadedNetwork;
119 }
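// A minimal caller-side sketch (illustrative only, assuming an optimized network optNet, properties
// props, a ProfilingService profilingService and a NetworkId netId already exist): the factory
// returns a null pointer on failure and reports the reason through errorMessage.
//
//     std::string errorMessage;
//     std::unique_ptr<LoadedNetwork> loaded = LoadedNetwork::MakeLoadedNetwork(
//         std::move(optNet), errorMessage, props, profilingService, netId);
//     if (!loaded)
//     {
//         ARMNN_LOG(error) << errorMessage;
//     }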
120 
121 LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
122  const INetworkProperties& networkProperties,
123  profiling::ProfilingService& profilingService,
124  const NetworkId networkId) :
125  m_OptimizedNetwork(std::move(net)),
126  m_NetworkProperties(networkProperties),
127  m_NetworkId(networkId),
128  m_TensorHandleFactoryRegistry(),
129  m_ProfilingService(profilingService)
130 {
131  // Create a profiler and register it for the current thread.
132  m_Profiler = std::make_shared<IProfiler>();
133  ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get());
134 
135  Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
136  // First create tensor handles, backends and workload factories.
137  // Handles are created before workloads because workload creation
138  // can modify some of the handles, for example for the splitter
139  // and concat layers.
140  for (auto&& layer : order)
141  {
142  auto const& backendId = layer->GetBackendId();
143  if (m_Backends.count(backendId) == 0)
144  {
145  auto createBackend = BackendRegistryInstance().GetFactory(backendId);
146  auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));
147 
148  IBackendInternal* backend = it.first->second.get();
149 
150  if (backend->SupportsTensorAllocatorAPI())
151  {
152  auto workloadFactory = backend->CreateWorkloadFactory(
153  m_TensorHandleFactoryRegistry, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
154  static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
155  static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
156  m_WorkloadFactories.emplace(
157  std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
158  }
159  else
160  {
161  IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
162  auto workloadFactory = backend->CreateWorkloadFactory(
163  memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
164 
165  m_WorkloadFactories.emplace(
166  std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
167  }
168  }
169  }
170 
171  // Create the thread pool which will have working memory handles assigned to each thread
172  // This should occur after the factories are registered so that the WorkingMemHandles can be created
173  if (m_NetworkProperties.m_NumThreads > 0 && networkProperties.m_AsyncEnabled)
174  {
175  CreateThreadPool(m_NetworkProperties.m_NumThreads);
176  }
177 
178  if (!networkProperties.m_AsyncEnabled)
179  {
180  for (auto &&layer : order)
181  {
182  auto &workloadFactory = GetWorkloadFactory(*layer);
183 
184  switch (layer->GetType())
185  {
186  case LayerType::Input:
187  case LayerType::MemImport:
188  {
189  // If IsImportEnabled is true then we need to set IsMemoryManaged
190  // to false when creating TensorHandles
191  layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
192  workloadFactory,
193  !m_NetworkProperties.m_ImportEnabled);
194  break;
195  }
196  default:
197  {
198  // Look for a layer with one OutputSlot that has a single connection, and that connection is to an Output layer.
199  // If export is enabled, disable memory management so we can export; otherwise we do a copy.
200  if ((layer->GetNumOutputSlots() == 1) &&
201  (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
202  (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
203  {
204  layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
205  workloadFactory,
206  !m_NetworkProperties.m_ExportEnabled);
207  }
208  else
209  {
210  layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
211  }
212  }
213  }
214  }
215  }
216 
217  ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
218  std::unique_ptr<TimelineUtilityMethods> timelineUtils =
219  TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
220  if (timelineUtils)
221  {
222  timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
223  // Mark the network with a start of life event
224  timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
225  // and with the process ID
226  int processID = armnnUtils::Processes::GetCurrentId();
227  std::stringstream ss;
228  ss << processID;
229  timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
230  }
231 
232  //Then create workloads.
233  for (auto&& layer : order)
234  {
235  if (timelineUtils)
236  {
237  // Add layer to the post-optimisation network structure
238  AddLayerStructure(timelineUtils, *layer, networkGuid);
239  }
240 
241  const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);
242 
243  switch (layer->GetType())
244  {
245  case LayerType::Input:
246  case LayerType::Output:
247  {
248  // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
249  break;
250  }
251  default:
252  {
253  auto workload = layer->CreateWorkload(workloadFactory);
254 
255  if (!workload)
256  {
257  const char* const layerName =
258  layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
259  throw InvalidArgumentException(
260  fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
261  layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
262  ));
263  }
264 
265  if (timelineUtils)
266  {
267  // Add workload to the post-optimisation network structure
268  AddWorkloadStructure(timelineUtils, workload, *layer);
269  }
270 
271  // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
272  // and are separated out from the other workloads
273  if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant)
274  {
275  m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
276  }
277  else
278  {
279  m_WorkloadQueue.push_back(move(workload));
280  }
281 
282  // Release the constant data in the layer.
283  layer->ReleaseConstantData();
284  break;
285  }
286  }
287  }
288 
289  for (auto&& workloadFactory : m_WorkloadFactories)
290  {
291  workloadFactory.second.first->AfterWorkloadsCreated();
292  }
293 
294  if (timelineUtils)
295  {
296  // Commit to send the post-optimisation network structure
297  timelineUtils->Commit();
298  }
299 
300  if (!networkProperties.m_AsyncEnabled)
301  {
302  // Set up memory.
303  m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
304 
305  // Now that the intermediate tensor memory has been set-up,
306  // do any post allocation configuration for each workload.
307  for (auto &workload : m_WorkloadQueue)
308  {
309  workload->PostAllocationConfigure();
310  }
311  }
312  else
313  {
314  AllocateAndExecuteConstantWorkloads();
315  }
316 }
317 
318 void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
319 {
320  Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
321  for (auto&& layer : order)
322  {
323  if (layer->GetType() == LayerType::Constant)
324  {
325  const auto& outSlot = layer->GetOutputSlots()[0];
326  const auto factoryId = outSlot.GetTensorHandleFactoryId();
327  ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
328  auto& workloadFactory = GetWorkloadFactory(*layer);
329 
330  layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
331  ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();
332 
333  m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
334  tensorHandle->Allocate();
335 
336  WorkingMemDescriptor memDesc;
337  memDesc.m_Outputs.push_back(tensorHandle);
338  m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(memDesc);
339  }
340  }
341 }
342 
343 
344  void LoadedNetwork::SendNetworkStructure()
345  {
346  Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
347  ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
348 
349  std::unique_ptr<TimelineUtilityMethods> timelineUtils =
350  TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
351 
352  timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
353 
354  for (auto&& layer : order)
355  {
356  // Add layer to the post-optimisation network structure
357  AddLayerStructure(timelineUtils, *layer, networkGuid);
358  switch (layer->GetType())
359  {
360  case LayerType::Input:
361  case LayerType::Output:
362  {
363  // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
364  break;
365  }
366  default:
367  {
368  for (auto& workload : m_WorkloadQueue)
369  {
370  // Add workload to the post-optimisation network structure
371  AddWorkloadStructure(timelineUtils, workload, *layer);
372  }
373  break;
374  }
375  }
376  }
377  // Commit to send the post-optimisation network structure
378  timelineUtils->Commit();
379 }
380 
381  profiling::ProfilingGuid LoadedNetwork::GetNetworkGuid()
382  {
383  return m_OptimizedNetwork->GetGuid();
384 }
385 
386  TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
387  {
388  for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
389  {
390  ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
391  if (inputLayer->GetBindingId() == layerId)
392  {
393  return inputLayer->GetOutputSlot(0).GetTensorInfo();
394  }
395  }
396 
397  throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
398 }
399 
400  TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
401  {
402  for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
403  {
404  ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
405  ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
406  if (outputLayer->GetBindingId() == layerId)
407  {
408  return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
409  }
410  }
411 
412  throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
413 }
414 
415 const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
416 {
417  const IWorkloadFactory* workloadFactory = nullptr;
418 
419  auto it = m_WorkloadFactories.find(layer.GetBackendId());
420  if (it == m_WorkloadFactories.end())
421  {
422  throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
423  layer.GetBackendId().Get(),
424  layer.GetNameStr()),
425  CHECK_LOCATION());
426  }
427 
428  workloadFactory = it->second.first.get();
429 
430  ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
431 
432  std::string reasonIfUnsupported;
433  ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer,
434  {},
435  reasonIfUnsupported,
436  m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()),
437  "Factory does not support layer");
438  IgnoreUnused(reasonIfUnsupported);
439  return *workloadFactory;
440 }
441 
442 namespace {
443 
444 // Non-copyable class owning accelerator-specific tensor data.
445 class TensorPin
446 {
447 public:
448  TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
449  : m_TensorHandle(std::move(handle))
450  , m_TensorInfo(info)
451  , m_Id(id)
452  {
453  }
454 
455  ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
456  const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
457  LayerBindingId GetBindingId() const { return m_Id; }
458 
459 private:
460  std::unique_ptr<ITensorHandle> m_TensorHandle;
461  TensorInfo m_TensorInfo;
462  LayerBindingId m_Id;
463 };
464 
465 static const TensorPin& GetTensorPin(LayerBindingId id,
466  const std::vector<TensorPin>& pins,
467  char const* bindingPointDesc)
468 {
469  auto it = std::find_if(pins.begin(), pins.end(),
470  [id](const TensorPin& pin)
471  {
472  return pin.GetBindingId() == id;
473  });
474 
475  if (it != pins.end())
476  {
477  return *it;
478  }
479  else
480  {
481  throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
482  }
483 }
484 
485 // Stores data that needs to be kept accessible for the entire execution of a workload.
486 class WorkloadData
487 {
488 public:
489  WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
490  {
491  m_InputTensorPins.reserve(inputTensors.size());
492  m_OutputTensorPins.reserve(outputTensors.size());
493 
494  for (auto inputTensorPair : inputTensors)
495  {
496  auto inputTensor = inputTensorPair.second;
497 
498  std::unique_ptr<ITensorHandle> tensorHandle =
499  std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),inputTensor.GetMemoryArea());
500  LayerBindingId layerId = inputTensorPair.first;
501 
502  m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
503  }
504 
505  for (auto outputTensorPair : outputTensors)
506  {
507  auto outputTensor = outputTensorPair.second;
508 
509  std::unique_ptr<ITensorHandle> tensorHandle =
510  std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
511  LayerBindingId layerId = outputTensorPair.first;
512 
513  m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
514  }
515  }
516 
517  const TensorPin& GetInputTensorPin(LayerBindingId id) const
518  {
519  return GetTensorPin(id, m_InputTensorPins, "input");
520  }
521 
522  const TensorPin& GetOutputTensorPin(LayerBindingId id) const
523  {
524  return GetTensorPin(id, m_OutputTensorPins, "output");
525  }
526 
527 private:
528 
529  std::vector<TensorPin> m_InputTensorPins;
530  std::vector<TensorPin> m_OutputTensorPins;
531 };
532 
533 }
534 
535  Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
536  const OutputTensors& outputTensors)
537 {
538  const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
539 
540  // Walk graph to determine the order of execution.
541  if (graph.GetNumLayers() < 2)
542  {
543  ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
544  return Status::Failure;
545  }
546 
547  // Data that must be kept alive for the entire execution of the workload.
548  WorkloadData workloadData(inputTensors, outputTensors);
549 
550  if (graph.GetNumInputs() != inputTensors.size())
551  {
552  throw InvalidArgumentException("Number of inputs provided does not match network.");
553  }
554 
555  // For each input to the network, call EnqueueInput with the data passed by the user.
556  {
557  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
558  m_InputQueue.clear();
559  m_InputQueue.reserve(graph.GetNumInputs());
560  for (const BindableLayer* inputLayer : graph.GetInputLayers())
561  {
562  const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
563  EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
564  }
565  }
566 
567  // For each output to the network, call EnqueueOutput with the data passed by the user.
568  {
569  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
570  m_OutputQueue.clear();
571  m_OutputQueue.reserve(graph.GetNumOutputs());
572  for (const BindableLayer* outputLayer : graph.GetOutputLayers())
573  {
574  const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
575  EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
576  }
577  }
578 
579  std::unique_ptr<TimelineUtilityMethods> timelineUtils =
580  TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
581  ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
582  if (timelineUtils)
583  {
584  // Add inference timeline trace if profiling is enabled.
585  ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
586  timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
587  timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
588  networkGuid,
589  inferenceGuid,
590  LabelsAndEventClasses::EXECUTION_OF_GUID);
591  timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
592  }
593 
594  bool executionSucceeded = true;
595 
596  {
597  if (m_ProfilingService.IsProfilingEnabled())
598  {
599  m_ProfilingService.IncrementCounterValue(armnn::profiling::INFERENCES_RUN);
600  }
601  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload");
602  ARMNN_SCOPED_HEAP_PROFILING("Executing");
603  executionSucceeded = Execute(timelineUtils, inferenceGuid);
604  }
605 
606  if (timelineUtils)
607  {
608  // Add end of life of the inference timeline if profiling is enabled.
609  timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
610  timelineUtils->Commit();
611  }
612  return executionSucceeded ? Status::Success : Status::Failure;
613 }
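// A minimal synchronous-inference sketch from the caller's side (illustrative assumptions: the
// network is already loaded into an IRuntime as netId and its single input and single output both
// use binding id 0):
//
//     std::vector<float> inData(inputSize);
//     std::vector<float> outData(outputSize);
//     armnn::InputTensors inputs
//     {
//         { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inData.data()) }
//     };
//     armnn::OutputTensors outputs
//     {
//         { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outData.data()) }
//     };
//     armnn::Status status = runtime->EnqueueWorkload(netId, inputs, outputs);
//
// The runtime forwards that call to LoadedNetwork::EnqueueWorkload above, which binds the user
// buffers to the input and output layers and then runs the workload queue on the calling thread.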
614 
615 void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
616 {
617  if (layer.GetType() != LayerType::Input)
618  {
619  throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
620  }
621 
622  if (tensorHandle == nullptr)
623  {
624  throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
625  }
626 
627  InputQueueDescriptor inputQueueDescriptor;
628  WorkloadInfo info;
629 
630  inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
631  info.m_InputTensorInfos.push_back(tensorInfo);
632 
633  ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
634  const OutputHandler& handler = layer.GetOutputHandler();
635  const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
636  ITensorHandle* outputTensorHandle = handler.GetData();
637  ARMNN_ASSERT_MSG(outputTensorHandle != nullptr,
638  "Data should have been allocated.");
639  inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
640  info.m_OutputTensorInfos.push_back(outputTensorInfo);
641 
642  MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
643  bool needMemCopy = true;
644  if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor
645  {
646  if(CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
647  {
648  needMemCopy = false;
649  // This assumes a CPU Tensor handle
650  void* mem = tensorHandle->Map(false);
651  if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
652  {
653  tensorHandle->Unmap();
654  return; // No need for a workload since the import has been done.
655  }
656  tensorHandle->Unmap();
657  throw MemoryImportException("EnqueueInput: Memory Import failed");
658  }
659  }
660  if (needMemCopy)
661  {
662  // Create a mem copy workload for input since we did not import
663  std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);
664 
665  ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");
666 
667  std::unique_ptr<TimelineUtilityMethods> timelineUtils =
668  TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
669  if (timelineUtils)
670  {
671  // Add Input Workload to the post-optimisation network structure
672  AddWorkloadStructure(timelineUtils, inputWorkload, layer);
673  timelineUtils->Commit();
674  }
675 
676  m_InputQueue.push_back(move(inputWorkload));
677  }
678 }
679 
680 void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
681 {
682  if (layer.GetType() != LayerType::Output)
683  {
684  throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
685  }
686 
687  if (tensorHandle == nullptr)
688  {
689  throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
690  }
691 
692  OutputQueueDescriptor outputQueueDescriptor;
693  WorkloadInfo info;
694 
695  outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
696  info.m_OutputTensorInfos.push_back(tensorInfo);
697 
698  ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
699 
700  // Gets the output handler from the previous node.
701  const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();
702 
703  const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
704  ITensorHandle* inputTensorHandle = outputHandler.GetData();
705  ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
706 
707  // Try import the output tensor.
708  // Note: We can only import the output pointer if all of the following hold true:
709  // a) The imported pointer is aligned sufficiently
710  // b) The tensor has zero padding
711  // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
712  // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
713  // e) m_IsExportEnabled must be set to true
714  bool needMemCopy = true;
715  if (m_NetworkProperties.m_ExportEnabled &&
716  (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
717  {
718  if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
719  {
720  MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
721  if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
722  {
723  needMemCopy = false;
724  void *mem = tensorHandle->Map(false);
725  bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
726  tensorHandle->Unmap();
727 
728  if (importOk)
729  {
730  // Insert synchronization workload
731  MemSyncQueueDescriptor syncDesc;
732  syncDesc.m_Inputs.push_back(inputTensorHandle);
733  info.m_InputTensorInfos.push_back(inputTensorInfo);
734  auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
735  ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
736  m_OutputQueue.push_back(move(syncWorkload));
737  }
738  else
739  {
740  throw MemoryExportException("EnqueueOutput: Memory Export failed");
741  }
742  }
743  }
744  }
745  if (needMemCopy)
746  {
747  // If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
748  outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
749  info.m_InputTensorInfos.push_back(inputTensorInfo);
750 
751  std::unique_ptr<IWorkload> outputWorkload =
752  std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
753  ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");
754 
755  std::unique_ptr<TimelineUtilityMethods> timelineUtils =
756  TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
757  if (timelineUtils)
758  {
759  // Add Output Workload to the post-optimisation network structure
760  AddWorkloadStructure(timelineUtils, outputWorkload, layer);
761  timelineUtils->Commit();
762  }
763 
764  m_OutputQueue.push_back(move(outputWorkload));
765  }
766 }
767 
768 void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
769 {
770  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");
771 
772  // this unused parameter makes sure we can only call this function with a valid lock
773  IgnoreUnused(lock);
774 
775  if (m_IsWorkingMemAllocated)
776  {
777  return;
778  }
779  for (auto&& workloadFactory : m_WorkloadFactories)
780  {
781  IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
782  if (memoryManager)
783  {
784  memoryManager->Acquire();
785  }
786  }
787  m_TensorHandleFactoryRegistry.AquireMemory();
788  m_IsWorkingMemAllocated = true;
789 }
790 
791  void LoadedNetwork::FreeWorkingMemory()
792  {
793  std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
794  if (!m_IsWorkingMemAllocated)
795  {
796  return;
797  }
798  // Inform each memory manager to release the memory in its respective memory group
799  for (auto&& workloadFactory : m_WorkloadFactories)
800  {
801  IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
802  if (memoryManager)
803  {
804  memoryManager->Release();
805  }
806  }
807  m_TensorHandleFactoryRegistry.ReleaseMemory();
808  m_IsWorkingMemAllocated = false;
809 }
810 
811 bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
812  profiling::ProfilingGuid inferenceGuid)
813 {
814  bool success = true;
815 
816  auto Fail = [&](const std::exception& error)
817  {
818  ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
819  success = false;
820  };
821 
822  try
823  {
824  std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
825  AllocateWorkingMemory(lockGuard);
826 
827  ProfilingDynamicGuid workloadInferenceID(0);
828  auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
829  {
830  for (auto& workload : queue)
831  {
832  if(timelineUtils)
833  {
834  workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
835  inferenceGuid);
836  }
837  workload->Execute();
838  if(timelineUtils)
839  {
840  timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
841  }
842  }
843  };
844 
845  ExecuteQueue(m_InputQueue);
846  ExecuteQueue(m_WorkloadQueue);
847  ExecuteQueue(m_OutputQueue);
848  }
849  catch (const RuntimeException& error)
850  {
851  Fail(error);
852  }
853  catch (const std::runtime_error& error)
854  {
855  Fail(error);
856  }
857 
858  return success;
859 }
860 
861 void LoadedNetwork::CreateThreadPool(std::size_t numThreads)
862 {
863 
864  for (auto i = 0u; i < numThreads; ++i)
865  {
866  std::unique_ptr<IWorkingMemHandle> workingMemHandle = CreateWorkingMemHandle(m_NetworkId);
867  m_Threads.emplace_back(
868  std::make_unique<std::thread>(
869  &LoadedNetwork::ProcessExecPriorities,
870  this,
871  std::move(workingMemHandle)
872  )
873  );
874  }
875 }
876 
877 void LoadedNetwork::TerminateThreadPool() noexcept
878 {
879  {
880  std::unique_lock<std::mutex> threadPoolLock(m_ThreadPoolMutex);
881  m_TerminatePool = true;
882  }
883 
884  m_ThreadPoolEvent.notify_all();
885 
886  for (auto &thread : m_Threads)
887  {
888  thread->join();
889  }
890 }
891 
892 void LoadedNetwork::Schedule(const InputTensors& inputTensors,
893  const OutputTensors& outputTensors,
894  const QosExecPriority priority,
895  std::shared_ptr<IAsyncExecutionCallback> cb)
896 {
897  // Group execution parameters so that they can be easily added to the queue
898  ExecutionTuple groupExecParams = std::make_tuple(inputTensors, outputTensors, cb);
899  std::shared_ptr<ExecutionTuple> operation = make_shared<ExecutionTuple>(groupExecParams);
900 
901  // Add a message to the queue and notify the request thread
902  std::unique_lock<std::mutex> lock(m_ThreadPoolMutex);
903  switch (priority) {
904  case QosExecPriority::High:
905  m_HighPriorityQueue.push(operation);
906  break;
907  case QosExecPriority::Low:
908  m_LowPriorityQueue.push(operation);
909  break;
910  case QosExecPriority::Medium:
911  default:
912  m_MediumPriorityQueue.push(operation);
913  }
914  m_ThreadPoolEvent.notify_one();
915 }
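// A sketch of queueing an asynchronous execution (illustrative assumptions: the network was loaded
// with m_AsyncEnabled and m_NumThreads > 0 so the thread pool exists, inputs/outputs are built as
// in the EnqueueWorkload sketch above, and cb is a caller-supplied implementation of
// IAsyncExecutionCallback that receives the Status and start/end timestamps):
//
//     loadedNetwork->Schedule(inputs, outputs, armnn::QosExecPriority::Medium, cb);
//
// Schedule only enqueues the tuple and wakes one pool thread; ProcessExecPriorities below dequeues
// it according to its priority and invokes cb->Notify() once the inference has run.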
916 
917 void LoadedNetwork::ProcessExecPriorities(std::unique_ptr<IWorkingMemHandle> workingMemHandle)
918 {
919  int expireRate = EXPIRE_RATE;
920  int highPriorityCount = 0;
921  int mediumPriorityCount = 0;
922 
923  IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get();
924 
925  while (true)
926  {
927  std::shared_ptr<ExecutionTuple> currentExecInProgress(nullptr);
928  {
929  // Wait for a message to be added to the queue
930  // This is in a separate scope to minimise the lifetime of the lock
931  std::unique_lock<std::mutex> lock(m_ThreadPoolMutex);
932 
933  m_ThreadPoolEvent.wait(lock,
934  [=] {
935  return m_TerminatePool || !m_HighPriorityQueue.empty() ||
936  !m_MediumPriorityQueue.empty() || !m_LowPriorityQueue.empty();
937  });
938 
939  if (m_TerminatePool && m_HighPriorityQueue.empty() && m_MediumPriorityQueue.empty() &&
940  m_LowPriorityQueue.empty())
941  {
942  break;
943  }
944 
945  // Get the message to process from the front of each queue based on priority from high to low
946  // Get high priority first if it does not exceed the expire rate
947  if (!m_HighPriorityQueue.empty() && highPriorityCount < expireRate)
948  {
949  currentExecInProgress = m_HighPriorityQueue.front();
950  m_HighPriorityQueue.pop();
951  highPriorityCount += 1;
952  }
953  // If high priority queue is empty or the count exceeds the expire rate, get medium priority message
954  else if (!m_MediumPriorityQueue.empty() && mediumPriorityCount < expireRate)
955  {
956  currentExecInProgress = m_MediumPriorityQueue.front();
957  m_MediumPriorityQueue.pop();
958  mediumPriorityCount += 1;
959  // Reset high priority count
960  highPriorityCount = 0;
961  }
962  // If medium priority queue is empty or the count exceeds the expire rate, get low priority message
963  else if (!m_LowPriorityQueue.empty())
964  {
965  currentExecInProgress = m_LowPriorityQueue.front();
966  m_LowPriorityQueue.pop();
967  // Reset high and medium priority count
968  highPriorityCount = 0;
969  mediumPriorityCount = 0;
970  }
971  else
972  {
973  // Reset high and medium priority count
974  highPriorityCount = 0;
975  mediumPriorityCount = 0;
976  continue;
977  }
978  }
979 
980  // invoke the asynchronous execution method
981  auto inputTensors = std::get<0>(*currentExecInProgress);
982  auto outputTensors = std::get<1>(*currentExecInProgress);
983  auto cb = std::get<2>(*currentExecInProgress);
984 
985  // Get time at start of inference
986  HighResolutionClock startTime = armnn::GetTimeNow();
987 
988  try // executing the inference
989  {
990  // Execute and populate the time at end of inference in the callback
991  Execute(inputTensors, outputTensors, workingMemHandleRef) == Status::Success ?
992  cb->Notify(Status::Success, std::make_pair(startTime, armnn::GetTimeNow())) :
993  cb->Notify(Status::Failure, std::make_pair(startTime, armnn::GetTimeNow()));
994  }
995  catch (const RuntimeException& error)
996  {
997  cb->Notify(Status::Failure, std::make_pair(startTime, armnn::GetTimeNow()));
998  }
999  }
1000 }
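// Scheduling note: a pool thread serves at most EXPIRE_RATE high-priority jobs in a row while
// medium-priority work is waiting, and at most EXPIRE_RATE medium-priority jobs in a row while
// low-priority work is waiting; the counters are reset whenever a lower-priority queue is serviced.
// This keeps a steady stream of high-priority requests from starving the other queues indefinitely.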
1001 
1002 void LoadedNetwork::EnqueueInput(const BindableLayer& layer,
1003  const ConstTensor& inputTensor,
1004  WorkingMemHandle& context)
1005 {
1006  if (layer.GetType() != LayerType::Input)
1007  {
1008  throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
1009  }
1010  LayerGuid id = layer.GetGuid();
1011  WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id);
1012 
1013  MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags();
1014  if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor
1015  {
1016  if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource) )
1017  {
1018  // This assumes a CPU Tensor handle
1019  std::unique_ptr<ITensorHandle> tensorHandle =
1020  std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),
1021  inputTensor.GetMemoryArea());
1022 
1023  void* mem = tensorHandle->Map(false);
1024  if (descriptor.m_Outputs[0]->Import(mem, m_NetworkProperties.m_InputSource))
1025  {
1026  tensorHandle->Unmap();
1027  return;
1028  }
1029  tensorHandle->Unmap();
1030  throw MemoryImportException("EnqueueInput: Memory Import failed");
1031  }
1032  else
1033  {
1034  throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
1035  }
1036  }
1037  else
1038  {
1039  std::unique_ptr<ITensorHandle> tensorHandle =
1040  std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
1041 
1042  auto copyFunc = [](void* dst, const void* src, size_t size)
1043  {
1044  memcpy(dst, src, size);
1045  };
1046 
1047  for (const auto& input : descriptor.m_Outputs)
1048  {
1049  CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc);
1050  }
1051  }
1052 }
1053 
1054 void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle)
1055 {
1056  if (layer.GetType() != LayerType::Output)
1057  {
1058  throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
1059  }
1060  ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
1061 
1062  LayerGuid id = layer.GetGuid();
1063  WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id);
1064 
1065  ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0];
1066  ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
1067 
1068  // Try import the output tensor.
1069  // Note: We can only import the output pointer if all of the following hold true:
1070  // a) The imported pointer is aligned sufficiently
1071  // b) The tensor has zero padding
1072  // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
1073  // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
1074  // e) m_IsExportEnabled must be set to true
1075  if (m_NetworkProperties.m_ExportEnabled &&
1076  (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
1077  {
1078  if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
1079  {
1080  MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
1081  if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
1082  {
1083  std::unique_ptr<ITensorHandle> tensorHandle =
1084  std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
1085  outputTensor.GetMemoryArea());
1086 
1087  void* mem = tensorHandle->Map(false);
1088  bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
1089  tensorHandle->Unmap();
1090 
1091  if (importOk)
1092  {
1093  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
1094  inputTensorHandle->Map(true);
1095  inputTensorHandle->Unmap();
1096  }
1097  else
1098  {
1099  throw MemoryExportException("EnqueueOutput: Memory Export failed");
1100  }
1101  }
1102  else
1103  {
1104  throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export");
1105  }
1106  }
1107  else
1108  {
1109  throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer");
1110  }
1111  }
1112  else
1113  {
1114  auto copyFunc = [](void* dst, const void* src, size_t size)
1115  {
1116  memcpy(dst, src, size);
1117  };
1118 
1119  std::unique_ptr<ITensorHandle> tensorHandle =
1120  std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
1121  outputTensor.GetMemoryArea());
1122 
1123  CopyTensorContentsGeneric(inputTensorHandle, tensorHandle.get(), copyFunc);
1124  }
1125 }
1126 
1127 
1128 const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors)
1129 {
1130  for (auto inputTensorPair : inputTensors)
1131  {
1132  LayerBindingId id = inputTensorPair.first;
1133  if (id == layerId)
1134  {
1135  return inputTensorPair.second;
1136  }
1137  }
1138  throw InvalidArgumentException("Input does not exist.");
1139 }
1140 
1141 const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors)
1142 {
1143  for (auto outputTensorPair : outputTensors)
1144  {
1145  LayerBindingId id = outputTensorPair.first;
1146  if (id == layerId)
1147  {
1148  return outputTensorPair.second;
1149  }
1150  }
1151  throw InvalidArgumentException("Output does not exist.");
1152 }
1153 
1154  Status LoadedNetwork::Execute(const InputTensors& inputTensors,
1155  const OutputTensors& outputTensors,
1156  IWorkingMemHandle& iWorkingMemHandle)
1157 {
1158  const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
1159 
1160  // Walk graph to determine the order of execution.
1161  if (graph.GetNumLayers() < 2)
1162  {
1163  ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
1164  return Status::Failure;
1165  }
1166 
1167  if (graph.GetNumInputs() != inputTensors.size())
1168  {
1169  throw InvalidArgumentException("Number of inputs provided does not match network.");
1170  }
1171 
1172  std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils =
1173  profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
1174  profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
1175  if (timelineUtils)
1176  {
1177  // Add inference timeline trace if profiling is enabled.
1178  profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
1179  timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID);
1180  timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink,
1181  networkGuid,
1182  inferenceGuid,
1183  profiling::LabelsAndEventClasses::EXECUTION_OF_GUID);
1184  timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
1185  }
1186 
1187  bool executionSucceeded = true;
1188 
1189  if (timelineUtils)
1190  {
1191  // Add end of life of the inference timeline if profiling is enabled.
1192  timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
1193  timelineUtils->Commit();
1194  }
1195  WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
1196  std::lock_guard<std::mutex> lockGuard(workingMemHandle.GetMutex());
1197 
1198  if (!workingMemHandle.IsAllocated())
1199  {
1200  workingMemHandle.Allocate();
1201  }
1202 
1203  {
1204  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
1205  for (const BindableLayer* inputLayer : graph.GetInputLayers())
1206  {
1207  EnqueueInput(*inputLayer, GetInputTensor(inputLayer->GetBindingId(), inputTensors), workingMemHandle);
1208  }
1209  }
1210 
1211  auto Fail = [&](const std::exception& error)
1212  {
1213  ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
1214  executionSucceeded = false;
1215  };
1216  profiling::ProfilingDynamicGuid workloadInferenceID(0);
1217 
1218  try
1219  {
1220  for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
1221  {
1222  auto& workload = m_WorkloadQueue[i];
1223  if (timelineUtils)
1224  {
1225  workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
1226  inferenceGuid);
1227  }
1228  workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i));
1229 
1230  if (timelineUtils)
1231  {
1232  timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
1233  }
1234  }
1235  }
1236  catch (const RuntimeException& error)
1237  {
1238  Fail(error);
1239  }
1240  catch (const std::runtime_error& error)
1241  {
1242  Fail(error);
1243  }
1244  // For each output to the network, call EnqueueOutput with the data passed by the user.
1245  {
1246  ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
1247  for (const BindableLayer *outputLayer : graph.GetOutputLayers())
1248  {
1249  EnqueueOutput(*outputLayer, GetOutputTensor(outputLayer->GetBindingId(), outputTensors), workingMemHandle);
1250  }
1251  }
1252 
1253  return executionSucceeded ? Status::Success : Status::Failure;
1254 }
1255 
1256 /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
1257 /// overlapped Execution by calling this function from different threads.
1258 std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
1259 {
1260  Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
1261  std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > tensorHandleMap;
1262  std::vector<WorkingMemDescriptor> workingMemDescriptors;
1263  std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
1264  TensorHandleFactoryRegistry tensorHandleFactoryRegistry;
1265  WorkloadFactoryMap workloadFactoryMap;
1266 
1267  std::vector<std::shared_ptr<IMemoryManager>> memoryManagers;
1268 
1269  for (auto const& backend : m_Backends)
1270  {
1271  if (backend.second->SupportsTensorAllocatorAPI())
1272  {
1273  backend.second->RegisterTensorHandleFactories(
1274  tensorHandleFactoryRegistry,
1275  static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
1276  static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
1277  memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back());
1278  }
1279  else
1280  {
1281  std::shared_ptr<IMemoryManager> memoryManager = backend.second->CreateMemoryManager();
1282  auto workloadFactory = backend.second->CreateWorkloadFactory(
1283  memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
1284 
1285  workloadFactoryMap.emplace(
1286  std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager)));
1287  memoryManagers.emplace_back(memoryManager);
1288  }
1289  }
1290 
1291  auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged)
1292  {
1293  ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
1294  const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
1295 
1296  if (factoryId == ITensorHandleFactory::LegacyFactoryId)
1297  {
1298  BackendId id = layer->GetBackendId();
1299  ARMNN_NO_DEPRECATE_WARN_BEGIN
1300  return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged);
1301  ARMNN_NO_DEPRECATE_WARN_END
1302  }
1303  else
1304  {
1305  ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId);
1306  ARMNN_ASSERT(handleFactory);
1307  return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
1308  }
1309  };
1310 
1311  std::unordered_map<const ITensorHandle*, unsigned int> handleReferenceCounts;
1312  for (auto&& layer : order)
1313  {
1314  WorkingMemDescriptor workingMemDescriptor;
1315 
1316  // Constant layers' execution and management are handled during loaded network construction
1317  if (layer->GetType() == LayerType::Constant)
1318  {
1319  continue;
1320  }
1321  bool isMemoryManaged = true;
1322  bool isInputLayer = true;
1323  // Look for a layer with one OutputSlot that has a single connection, and that connection is to an Output layer.
1324  // If export is enabled, disable memory management so we can export; otherwise we do a copy.
1325  if ((layer->GetNumOutputSlots() == 1) &&
1326  (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
1327  (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
1328  {
1329  isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
1330  }
1331  else if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
1332  {
1333  // Input layers/workloads will not be executed so the descriptor is not added to workingMemDescriptors
1334  // However we will still need to manage the tensorHandle
1335  isInputLayer = false;
1336  isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
1337  }
1338 
1339  // Create a tensor handle for each output slot of a layer
1340  // Once we create it, we start managing its lifetime
1341  for (auto& slot : layer->GetOutputSlots())
1342  {
1343  tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged));
1344  ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get();
1345 
1346  workingMemDescriptor.m_Outputs.push_back(tensorHandle);
1347  tensorHandle->Manage();
1348  unsigned int numConnections = slot.GetNumConnections();
1349  ARMNN_ASSERT(numConnections != 0);
1350 
1351  handleReferenceCounts[tensorHandle] = numConnections;
1352  }
1353  // Loop through the input slots in the same layer and decrement the reference counter associated
1354  // with each tensor handle we encounter.
1355  // Once it reaches zero, the lifetime of the tensor handle has ended, and we mark its memory as available
1356  // so that the next tensor handle with a non-overlapping lifetime can share its memory.
1357  for (auto& slot : layer->GetInputSlots())
1358  {
1359  ARMNN_ASSERT(slot.GetConnection());
1360  auto outputSlot = slot.GetConnectedOutputSlot();
1361  auto key = outputSlot->GetOwningLayer().GetGuid();
1362 
1363  // Constant layers' execution and management are handled during loaded network construction
1364  auto found = m_ConstantTensorHandles.find(key);
1365  if (found != m_ConstantTensorHandles.end())
1366  {
1367  workingMemDescriptor.m_Inputs.push_back(found->second);
1368  continue;
1369  }
1370 
1371  auto search = tensorHandleMap.find(key);
1372  unsigned int index = outputSlot->CalculateIndexOnOwner();
1373  ITensorHandle* inputTensorHandle = search->second[index].get();
1374  workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
1375  --handleReferenceCounts.at(inputTensorHandle);
1376  if (handleReferenceCounts.at(inputTensorHandle) == 0u)
1377  {
1378  // Stop managing lifetime of tensor handle
1379  inputTensorHandle->Allocate();
1380  handleReferenceCounts.erase(inputTensorHandle);
1381  }
1382  }
1383  workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
1384 
1385  // Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors
1386  // However we will still need to manage the tensorHandle
1387  if (isInputLayer)
1388  {
1389  workingMemDescriptors.push_back(workingMemDescriptor);
1390  }
1391  }
1392 
1393  return std::make_unique<WorkingMemHandle>(networkId,
1394  workingMemDescriptors,
1395  workingMemDescriptorMap,
1396  memoryManagers,
1397  std::move(tensorHandleMap));
1398 }
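// A sketch of overlapped execution using handles created by the function above (illustrative
// assumptions: loadedNetwork and netId exist, and inputsA/outputsA and inputsB/outputsB were built
// as in the earlier EnqueueWorkload sketch):
//
//     auto handleA = loadedNetwork->CreateWorkingMemHandle(netId);
//     auto handleB = loadedNetwork->CreateWorkingMemHandle(netId);
//     std::thread threadA([&] { loadedNetwork->Execute(inputsA, outputsA, *handleA); });
//     std::thread threadB([&] { loadedNetwork->Execute(inputsB, outputsB, *handleB); });
//     threadA.join();
//     threadB.join();
//
// Each handle owns its own intermediate tensor memory, so the two inferences only synchronise on
// the per-handle mutex taken inside Execute, not on a shared working-memory pool.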
1399 
1400  void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
1401  {
1402  for (auto&& workloadPtr: m_WorkloadQueue)
1403  {
1404  workloadPtr.get()->RegisterDebugCallback(func);
1405  }
1406 }
1407 
1408 }