ArmNN
 22.05.01
EndToEndTestImpl.hpp
1 //
2 // Copyright © 2017 Arm Ltd. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 #pragma once
6 
7 #include <CommonTestUtils.hpp>
8 
9 #include <armnn/Descriptors.hpp>
10 #include <armnn/INetwork.hpp>
11 #include <armnn/IRuntime.hpp>
12 
13 #include <Profiling.hpp>
14 #include <QuantizeHelper.hpp>
15 #include <ResolveType.hpp>
16 
17 #include <doctest/doctest.h>
18 
19 #include <vector>
20 
21 namespace
22 {
23 
24 using namespace armnn;
25 
26 template<typename T>
27 bool ConstantUsageTest(const std::vector<BackendId>& computeDevice,
28  const TensorInfo& commonTensorInfo,
29  const std::vector<T>& inputData,
30  const std::vector<T>& constantData,
31  const std::vector<T>& expectedOutputData)
32 {
33  // Create runtime in which test will run
34  IRuntime::CreationOptions options;
35  IRuntimePtr runtime(IRuntime::Create(options));
36 
37  // Builds up the structure of the network.
38  INetworkPtr net(INetwork::Create());
39 
40  IConnectableLayer* input = net->AddInputLayer(0);
41  IConnectableLayer* constant = net->AddConstantLayer(ConstTensor(commonTensorInfo, constantData));
42  IConnectableLayer* add = net->AddAdditionLayer();
43  IConnectableLayer* output = net->AddOutputLayer(0);
44 
45  input->GetOutputSlot(0).Connect(add->GetInputSlot(0));
46  constant->GetOutputSlot(0).Connect(add->GetInputSlot(1));
47  add->GetOutputSlot(0).Connect(output->GetInputSlot(0));
48 
49  // Sets the tensors in the network.
50  input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
51  constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
52  add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
53 
54  // optimize the network
55  IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec());
56 
57  // Loads it into the runtime.
58  NetworkId netId;
59  runtime->LoadNetwork(netId, std::move(optNet));
60 
61  // Creates structures for input & output.
62  std::vector<T> outputData(inputData.size());
63 
64  InputTensors inputTensors
65  {
66  {0, ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())}
67  };
68  OutputTensors outputTensors
69  {
70  {0, Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
71  };
72 
73  // Does the inference.
74  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
75 
76  // Checks the results.
77  return outputData == expectedOutputData;
78 }
79 
80 inline bool ConstantUsageFloat32Test(const std::vector<BackendId>& backends)
81 {
82  TensorInfo commonTensorInfo({ 2, 3 }, DataType::Float32);
83  commonTensorInfo.SetConstant(true);
84 
85  return ConstantUsageTest(backends,
86  commonTensorInfo,
87  std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input.
88  std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input.
89  std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // Expected output.
90  );
91 }
92 
93 inline bool ConstantUsageUint8Test(const std::vector<BackendId>& backends)
94 {
95  TensorInfo commonTensorInfo({ 2, 3 }, DataType::QAsymmU8);
96 
97  const float scale = 0.023529f;
98  const int8_t offset = -43;
99 
100  commonTensorInfo.SetQuantizationScale(scale);
101  commonTensorInfo.SetQuantizationOffset(offset);
102  commonTensorInfo.SetConstant(true);
103 
104  return ConstantUsageTest(backends,
105  commonTensorInfo,
106  armnnUtils::QuantizedVector<uint8_t>({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, scale, offset), // Input.
107  armnnUtils::QuantizedVector<uint8_t>({ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, scale, offset), // Const input.
108  armnnUtils::QuantizedVector<uint8_t>({ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }, scale, offset) // Expected output.
109  );
110 }
111 
112 // Utility function to find the number of instances of a substring within a string.
113 int SubStringCounter(std::string& string, std::string&& substring)
114 {
115  std::size_t found = 0;
116  int count = 0;
117  // Look for the substring starting from where we last found the substring
118  while((found = string.find(substring, found)) != std::string::npos)
119  {
120  count++;
121  // Offset by substring length to avoid finding the same substring twice
122  found += substring.length();
123  }
124  return count;
125 }
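Note: SubStringCounter is used by the tests below to count occurrences of workload names (for example CopyMemGeneric or SyncMemGeneric) in the profiler dump. A hypothetical usage, with an illustrative dump string rather than real profiler output:

    std::string dump = "CopyMemGeneric_Execute CopyMemGeneric_Execute SyncMemGeneric_Execute";
    CHECK(SubStringCounter(dump, "CopyMemGeneric") == 2);
    CHECK(SubStringCounter(dump, "SyncMemGeneric") == 1);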
126 
127 template<DataType ArmnnIType, DataType ArmnnOType,
128  typename TInput = ResolveType<ArmnnIType>, typename TOutput = ResolveType<ArmnnOType>>
129 void EndToEndLayerTestImpl(INetworkPtr network,
130  const std::map<int, std::vector<TInput>>& inputTensorData,
131  const std::map<int, std::vector<TOutput>>& expectedOutputData,
132  std::vector<BackendId> backends,
133  float tolerance = 0.000001f)
134 {
135  // Create runtime in which test will run
136  IRuntime::CreationOptions options;
137  IRuntimePtr runtime(IRuntime::Create(options));
138 
139  // optimize the network
140  IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
141 
142  // Loads it into the runtime.
143  NetworkId netId;
144  runtime->LoadNetwork(netId, std::move(optNet));
145 
146  InputTensors inputTensors;
147  inputTensors.reserve(inputTensorData.size());
148  for (auto&& it : inputTensorData)
149  {
150  inputTensors.push_back({it.first,
151  ConstTensor(runtime->GetInputTensorInfo(netId, it.first), it.second.data())});
152  }
153  OutputTensors outputTensors;
154  outputTensors.reserve(expectedOutputData.size());
155  std::map<int, std::vector<TOutput>> outputStorage;
156  for (auto&& it : expectedOutputData)
157  {
158  std::vector<TOutput> out(it.second.size());
159  outputStorage.emplace(it.first, out);
160  outputTensors.push_back({it.first,
161  Tensor(runtime->GetOutputTensorInfo(netId, it.first),
162  outputStorage.at(it.first).data())});
163  }
164 
165  // Does the inference.
166  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
167 
168  // Checks the results.
169  for (auto&& it : expectedOutputData)
170  {
171  std::vector<TOutput> out = outputStorage.at(it.first);
172  for (unsigned int i = 0; i < out.size(); ++i)
173  {
174  CHECK_MESSAGE(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true,
175  "Actual output: " << out[i] << ". Expected output:" << it.second[i]);
176 
177  }
178  }
179 }
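Note: EndToEndLayerTestImpl is the generic driver used by the backend end-to-end tests: the caller builds an INetwork, supplies per-binding input data and expected outputs, and the helper optimizes, loads and runs the network and compares the results. A minimal hypothetical caller (the layer choice and values are illustrative, not part of this file):

    template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
    void AdditionEndToEndSketch(const std::vector<armnn::BackendId>& backends)
    {
        using namespace armnn;

        // Build a one-layer network: out = in0 + in1
        INetworkPtr net(INetwork::Create());
        IConnectableLayer* input0 = net->AddInputLayer(0);
        IConnectableLayer* input1 = net->AddInputLayer(1);
        IConnectableLayer* add    = net->AddAdditionLayer();
        IConnectableLayer* output = net->AddOutputLayer(0);

        input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
        input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
        add->GetOutputSlot(0).Connect(output->GetInputSlot(0));

        TensorInfo info({ 1, 4 }, ArmnnType, 1.0f, 0, true);
        input0->GetOutputSlot(0).SetTensorInfo(info);
        input1->GetOutputSlot(0).SetTensorInfo(info);
        add->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 4 }, ArmnnType, 1.0f, 0));

        // Keyed by input/output layer binding id, as expected by EndToEndLayerTestImpl.
        std::map<int, std::vector<T>> inputs   = { { 0, { 1, 2, 3, 4 } }, { 1, { 4, 3, 2, 1 } } };
        std::map<int, std::vector<T>> expected = { { 0, { 5, 5, 5, 5 } } };

        EndToEndLayerTestImpl<ArmnnType, ArmnnType>(std::move(net), inputs, expected, backends);
    }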
180 
181 inline void ImportNonAlignedInputPointerTest(std::vector<BackendId> backends)
182 {
183  using namespace armnn;
184 
185  // Create runtime in which test will run
186  IRuntime::CreationOptions options;
187  IRuntimePtr runtime(armnn::IRuntime::Create(options));
188 
189  // build up the structure of the network
190  INetworkPtr net(INetwork::Create());
191 
192  IConnectableLayer* input = net->AddInputLayer(0);
193 
194  ActivationDescriptor descriptor;
195  descriptor.m_Function = ActivationFunction::Square;
196  IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
197 
198  IConnectableLayer* output = net->AddOutputLayer(0);
199 
200  input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
201  pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
202 
203  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
204  pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
205 
206  // Optimize the network
207  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
208  CHECK(optNet);
209 
210  // Loads it into the runtime.
211  NetworkId netId;
212  std::string ignoredErrorMessage;
213  // Enable Importing
214  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
215  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
216 
217  // Creates structures for input & output
218  std::vector<float> inputData
219  {
220  1.0f, 2.0f, 3.0f, 4.0f
221  };
222 
223  // Misaligned input
224  float* misalignedInputData = reinterpret_cast<float*>(reinterpret_cast<char*>(inputData.data()) + 1);
225 
226  std::vector<float> outputData(4);
227 
228  // Aligned output
229  float* alignedOutputData = outputData.data();
230 
231  InputTensors inputTensors
232  {
233  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputData)},
234  };
235  OutputTensors outputTensors
236  {
237  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputData)}
238  };
239 
240  runtime->GetProfiler(netId)->EnableProfiling(true);
241 
242  // Do the inference and expect it to fail with a MemoryImportException
243  CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
244 }
245 
246 inline void ExportNonAlignedOutputPointerTest(std::vector<BackendId> backends)
247 {
248  using namespace armnn;
249 
250  // Create runtime in which test will run
251  IRuntime::CreationOptions options;
252  IRuntimePtr runtime(armnn::IRuntime::Create(options));
253 
254  // build up the structure of the network
255  INetworkPtr net(INetwork::Create());
256 
257  IConnectableLayer* input = net->AddInputLayer(0);
258 
259  ActivationDescriptor descriptor;
260  descriptor.m_Function = ActivationFunction::Square;
261  IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
262 
263  IConnectableLayer* output = net->AddOutputLayer(0);
264 
265  input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
266  pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
267 
268  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
269  pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
270 
271  // Optimize the network
272  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
273  CHECK(optNet);
274 
275  // Loads it into the runtime.
276  NetworkId netId;
277  std::string ignoredErrorMessage;
278  // Enable Importing and Exporting
279  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
280  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
281 
282  // Creates structures for input & output
283  std::vector<float> inputData
284  {
285  1.0f, 2.0f, 3.0f, 4.0f, 5.0f
286  };
287 
288  // Aligned input
289  float* alignedInputData = inputData.data();
290 
291  std::vector<float> outputData(5);
292 
293  // Misaligned output
294  float* misalignedOutputData = reinterpret_cast<float*>(reinterpret_cast<char*>(outputData.data()) + 1);
295 
296  InputTensors inputTensors
297  {
298  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputData)},
299  };
300  OutputTensors outputTensors
301  {
302  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputData)}
303  };
304 
305  // Do the inference and expect it to fail with a MemoryExportException
306  if (backends[0] == Compute::CpuAcc)
307  {
308  // For CpuAcc the NeonTensorHandle will throw its own exception on misaligned memory
309  CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
310  }
311  else
312  {
313  CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryExportException);
314  }
315 }
316 
317 inline void ImportAlignedPointerTest(std::vector<BackendId> backends)
318 {
319  using namespace armnn;
320 
321  // Create runtime in which test will run
322  IRuntime::CreationOptions options;
323  IRuntimePtr runtime(armnn::IRuntime::Create(options));
324 
325  // build up the structure of the network
326  INetworkPtr net(INetwork::Create());
327 
328  IConnectableLayer* input = net->AddInputLayer(0);
329 
330  ActivationDescriptor descriptor;
331  descriptor.m_Function = ActivationFunction::Square;
332  IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
333 
334  IConnectableLayer* output = net->AddOutputLayer(0);
335 
336  input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
337  pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
338 
339  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
340  pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
341 
342  // Optimize the network
343  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
344  CHECK(optNet);
345 
346  // Loads it into the runtime.
347  NetworkId netId;
348  std::string ignoredErrorMessage;
349  // Enable Importing
350  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
351  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
352 
353  // Creates structures for input & output
354  std::vector<float> inputData
355  {
356  1.0f, 2.0f, 3.0f, 4.0f
357  };
358 
359  std::vector<float> outputData(4);
360 
361  std::vector<float> expectedOutput
362  {
363  1.0f, 4.0f, 9.0f, 16.0f
364  };
365 
366  InputTensors inputTensors
367  {
368  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
369  };
370  OutputTensors outputTensors
371  {
372  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
373  };
374 
375  runtime->GetProfiler(netId)->EnableProfiling(true);
376 
377  // Do the inference
378  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
379 
380  // Retrieve the Profiler.Print() output to get the workload execution
381  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
382  std::stringstream ss;
383  profilerManager.GetProfiler()->Print(ss);
384  std::string dump = ss.str();
385 
386  // Contains ActivationWorkload
387  std::size_t found = dump.find("ActivationWorkload");
388  CHECK(found != std::string::npos);
389 
390  // Contains SyncMemGeneric
391  found = dump.find("SyncMemGeneric");
392  CHECK(found != std::string::npos);
393 
394  // Does not contain CopyMemGeneric
395  found = dump.find("CopyMemGeneric");
396  CHECK(found == std::string::npos);
397 
398  // Check output is as expected
399  CHECK(outputData == expectedOutput);
400 }
401 
402 inline void ImportOnlyWorkload(std::vector<BackendId> backends)
403 {
404  using namespace armnn;
405 
406  IRuntime::CreationOptions options;
407  IRuntimePtr runtime(IRuntime::Create(options));
408 
409  // Builds up the structure of the network.
410  INetworkPtr net(INetwork::Create());
411 
412  IConnectableLayer* input = net->AddInputLayer(0);
413 
414  ActivationDescriptor descriptor;
415  descriptor.m_Function = ActivationFunction::Square;
416  IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
417 
418  IConnectableLayer* output = net->AddOutputLayer(0);
419 
420  input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
421  pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
422 
423  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
424  pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
425 
426  // optimize the network
427  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
428 
429  INFO("Load Network");
430  // Load it into the runtime. It should pass.
431  NetworkId netId;
432  std::string ignoredErrorMessage;
433 
434  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
435 
436  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
437  == Status::Success);
438 
439  INFO("Generate Data");
440  // Creates structures for input & output
441  std::vector<float> inputData
442  {
443  1.0f, 2.0f, 3.0f, 4.0f
444  };
445 
446  std::vector<float> outputData(4);
447 
448  std::vector<float> expectedOutput
449  {
450  1.0f, 4.0f, 9.0f, 16.0f
451  };
452 
453  INFO("Create Inference");
454 
455  InputTensors inputTensors
456  {
457  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
458  };
459  OutputTensors outputTensors
460  {
461  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
462  };
463 
464  INFO("Get Profiler");
465  runtime->GetProfiler(netId)->EnableProfiling(true);
466 
467  INFO("Run Inference");
468  // Do the inference
469  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
470 
471  INFO("Print Profiler");
472  // Retrieve the Profiler.Print() output to get the workload execution
473  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
474  std::stringstream ss;
475  profilerManager.GetProfiler()->Print(ss);
476  std::string dump = ss.str();
477 
478  // Check there are no SyncMemGeneric workloads as we didn't export
479  INFO("Find SyncMemGeneric");
480  int count = SubStringCounter(dump, "SyncMemGeneric");
481  CHECK(count == 0);
482 
483  // Should only be 1 CopyMemGeneric for the output as we imported
484  INFO("Find CopyMemGeneric");
485  count = SubStringCounter(dump, "CopyMemGeneric");
486  CHECK(count == 1);
487 
488  // Check the output is correct
489  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
490 }
491 
492 inline void ExportOnlyWorkload(std::vector<BackendId> backends)
493 {
494  using namespace armnn;
495 
496  IRuntime::CreationOptions options;
497  IRuntimePtr runtime(IRuntime::Create(options));
498 
499  // Builds up the structure of the network.
500  INetworkPtr net(INetwork::Create());
501 
502  IConnectableLayer* input = net->AddInputLayer(0);
503 
504  ActivationDescriptor descriptor;
505  descriptor.m_Function = ActivationFunction::Square;
506  IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
507 
508  IConnectableLayer* output = net->AddOutputLayer(0);
509 
510  input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
511  pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
512 
513  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
514  pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
515 
516  // optimize the network
517  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
518 
519  INFO("Load Network");
520  // Load it into the runtime. It should pass.
521  NetworkId netId;
522  std::string ignoredErrorMessage;
523  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Malloc);
524  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
525  == Status::Success);
526 
527  INFO("Generate Data");
528  // Creates structures for input & output
529  std::vector<float> inputData
530  {
531  1.0f, 2.0f, 3.0f, 4.0f
532  };
533 
534  std::vector<float> outputData(4);
535 
536  std::vector<float> expectedOutput
537  {
538  1.0f, 4.0f, 9.0f, 16.0f
539  };
540 
541  INFO("Create Inference");
542 
543  InputTensors inputTensors
544  {
545  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
546  };
547  OutputTensors outputTensors
548  {
549  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
550  };
551 
552  INFO("Get Profiler");
553  runtime->GetProfiler(netId)->EnableProfiling(true);
554 
555  INFO("Run Inference");
556  // Do the inference
557  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
558 
559  INFO("Print Profiler");
560  // Retrieve the Profiler.Print() output to get the workload execution
561  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
562  std::stringstream ss;
563  profilerManager.GetProfiler()->Print(ss);
564  std::string dump = ss.str();
565 
566  // Check there is a SyncMemGeneric workload as we exported
567  INFO("Find SyncMemGeneric");
568  int count = SubStringCounter(dump, "SyncMemGeneric");
569  CHECK(count == 1);
570 
571  // Should be 1 CopyMemGeneric for the input as we did not import it
572  INFO("Find CopyMemGeneric");
573  count = SubStringCounter(dump, "CopyMemGeneric");
574  CHECK(count == 1);
575 
576  // Check the output is correct
577  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
578 }
579 
580 inline void ImportAndExportWorkload(std::vector<BackendId> backends)
581 {
582  using namespace armnn;
583 
584  IRuntime::CreationOptions options;
585  IRuntimePtr runtime(IRuntime::Create(options));
586 
587  // Builds up the structure of the network.
588  INetworkPtr net(INetwork::Create());
589 
590  IConnectableLayer* input = net->AddInputLayer(0);
591 
592  ActivationDescriptor descriptor;
593  descriptor.m_Function = ActivationFunction::Square;
594  IConnectableLayer* pooling = net->AddActivationLayer(descriptor);
595 
596  IConnectableLayer* output = net->AddOutputLayer(0);
597 
598  input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
599  pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
600 
601  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
602  pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
603 
604  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
605 
606  INFO("Load Network");
607  // Load it into the runtime. It should pass.
608  NetworkId netId;
609  std::string ignoredErrorMessage;
610 
611  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
612 
613  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
614  == Status::Success);
615 
616  INFO("Generate Data");
617  // Creates structures for input & output
618  std::vector<float> inputData
619  {
620  1.0f, 2.0f, 3.0f, 4.0f
621  };
622 
623  std::vector<float> outputData(4);
624 
625  std::vector<float> expectedOutput
626  {
627  1.0f, 4.0f, 9.0f, 16.0f
628  };
629 
630  INFO("Create inference");
631 
632  InputTensors inputTensors
633  {
634  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
635  };
636  OutputTensors outputTensors
637  {
638  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
639  };
640 
641  INFO("Get Profiler");
642  runtime->GetProfiler(netId)->EnableProfiling(true);
643 
644  INFO("Run Inference");
645  // Do the inference
646  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
647 
648  INFO("Print Profiler");
649  // Retrieve the Profiler.Print() output to get the workload execution
650  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
651  std::stringstream ss;
652  profilerManager.GetProfiler()->Print(ss);
653  std::string dump = ss.str();
654 
655  // Check there is a SyncMemGeneric workload as we exported
656  INFO("Find SyncMemGeneric");
657  int count = SubStringCounter(dump, "SyncMemGeneric");
658  CHECK(count == 1);
659 
660  // Shouldn't be any CopyMemGeneric workloads
661  INFO("Find CopyMemGeneric");
662  count = SubStringCounter(dump, "CopyMemGeneric");
663  CHECK(count == 0);
664 
665  // Check the output is correct
666  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
667 }
668 
669 inline void ExportOutputWithSeveralOutputSlotConnectionsTest(std::vector<BackendId> backends)
670 {
671  using namespace armnn;
672 
673  // Create runtime in which test will run
674  IRuntime::CreationOptions options;
675  IRuntimePtr runtime(armnn::IRuntime::Create(options));
676 
677  // build up the structure of the network
678  INetworkPtr net(INetwork::Create());
679 
680  IConnectableLayer* input = net->AddInputLayer(0);
681 
682  ActivationDescriptor descriptor;
683  descriptor.m_Function = ActivationFunction::Square;
684  IConnectableLayer* activation = net->AddActivationLayer(descriptor);
685 
686  IConnectableLayer* output0 = net->AddOutputLayer(0);
687  IConnectableLayer* output1 = net->AddOutputLayer(1);
688 
689  input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
690  activation->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
691  activation->GetOutputSlot(0).Connect(output1->GetInputSlot(0));
692 
693  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32, 0.0f, 0, true));
694  activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32));
695 
696  // Optimize the network
697  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
698 
699  // Loads it into the runtime.
700  NetworkId netId;
701  std::string ignoredErrorMessage;
702  // Enable Importing
703  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
704  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
705 
706  // Creates structures for input & output
707  std::vector<float> inputData
708  {
709  1.0f, 2.0f, 3.0f, 4.0f
710  };
711 
712  std::vector<float> outputData0(4);
713  std::vector<float> outputData1(4);
714 
715  std::vector<float> expectedOutput
716  {
717  1.0f, 4.0f, 9.0f, 16.0f
718  };
719 
720  InputTensors inputTensors
721  {
722  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
723  };
724  OutputTensors outputTensors
725  {
726  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData0.data())},
727  {1,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 1), outputData1.data())}
728  };
729 
730  // The result of the inference is not important, just the fact that an output slot with several
731  // connections cannot be exported, so we expect CopyMemGeneric rather than SyncMemGeneric workloads.
732  runtime->GetProfiler(netId)->EnableProfiling(true);
733 
734  // Do the inference
735  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
736 
737  // Retrieve the Profiler.Print() output to get the workload execution
738  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
739  std::stringstream ss;
740  profilerManager.GetProfiler()->Print(ss);
741  std::string dump = ss.str();
742 
743  std::size_t found = std::string::npos;
744 
745  if (backends[0] == Compute::CpuRef)
746  {
747  found = dump.find("RefActivationWorkload");
748  }
749  else if (backends[0] == Compute::CpuAcc)
750  {
751  found = dump.find("NeonActivationWorkload");
752  }
753  else if (backends[0] == Compute::GpuAcc)
754  {
755  found = dump.find("ClActivationWorkload");
756  }
757 
758  CHECK(found != std::string::npos);
759  // Does not contain SyncMemGeneric
760  found = dump.find("SyncMemGeneric");
761  CHECK(found == std::string::npos);
762  // Contains CopyMemGeneric
763  found = dump.find("CopyMemGeneric");
764  CHECK(found != std::string::npos);
765 
766  // Check that the outputs are correct
767  CHECK(std::equal(outputData0.begin(), outputData0.end(),
768  expectedOutput.begin(), expectedOutput.end()));
769  CHECK(std::equal(outputData1.begin(), outputData1.end(),
770  expectedOutput.begin(), expectedOutput.end()));
771 }
772 
773 inline void StridedSliceInvalidSliceEndToEndTest(std::vector<BackendId> backends)
774 {
775  using namespace armnn;
776 
777  // Create runtime in which test will run
778  IRuntime::CreationOptions options;
779  IRuntimePtr runtime(armnn::IRuntime::Create(options));
780 
781  // build up the structure of the network
782  INetworkPtr net(INetwork::Create());
783 
784  IConnectableLayer* input = net->AddInputLayer(0);
785 
786  // Configure a strided slice whose slice covers the whole input but with a ShrinkAxisMask on the first
787  // dim of the output to make it too small to hold the specified slice.
788  StridedSliceDescriptor descriptor;
789  descriptor.m_Begin = {0, 0};
790  descriptor.m_End = {2, 3};
791  descriptor.m_Stride = {1, 1};
792  descriptor.m_BeginMask = 0;
793  descriptor.m_EndMask = 0;
794  descriptor.m_ShrinkAxisMask = 1;
795  IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(descriptor);
796 
797  IConnectableLayer* output0 = net->AddOutputLayer(0);
798 
799  input->GetOutputSlot(0).Connect(stridedSlice->GetInputSlot(0));
800  stridedSlice->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
801 
802  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 2, 3 }, DataType::Float32, 0.0f, 0, true));
803  stridedSlice->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3 }, DataType::Float32));
804 
805  // Attempt to optimize the network and check that the correct exception is thrown
806  CHECK_THROWS_AS(Optimize(*net, backends, runtime->GetDeviceSpec()), armnn::LayerValidationException);
807 }
808 
809 inline void ForceImportWithAlignedBuffersEndToEndTest(std::vector<BackendId> backends)
810 {
811  /**
812  * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
813  * with 4 floats, square them, and validate the output. We then check the profiling logs to see if input/output
814  * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
815  * In this case all inputs and outputs should be imported
816  */
817  using namespace armnn;
818  IRuntime::CreationOptions options;
819  IRuntimePtr runtime(IRuntime::Create(options));
820 
821  // Builds up the structure of the network.
822  INetworkPtr net(INetwork::Create());
823  IConnectableLayer* input = net->AddInputLayer(0);
824  ActivationDescriptor descriptor;
825  descriptor.m_Function = ActivationFunction::Square;
826  IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
827  IConnectableLayer* output = net->AddOutputLayer(0);
828  input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
829  activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
830  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
831  activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
832  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
833  INFO("Load Network");
834 
835  // Load it into the runtime. It should pass.
836  NetworkId netId;
837  std::string ignoredErrorMessage;
838  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
839  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
840  == Status::Success);
841  INFO("Generate Data");
842 
843  // Creates structures for input & output
844  std::vector<float> inputData
845  {
846  1.0f, 2.0f, 3.0f, 4.0f
847  };
848  std::vector<float> outputData(4);
849  std::vector<float> expectedOutput
850  {
851  1.0f, 4.0f, 9.0f, 16.0f
852  };
853 
854  // Check our input and output pointers are actually aligned
855  uintptr_t alignment = GetDataTypeSize(DataType::Float32);
856  CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
857  CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
858 
859  INFO("Create Inference");
860  InputTensors inputTensors
861  {
862  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
863  };
864  OutputTensors outputTensors
865  {
866  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
867  };
868 
869  runtime->GetProfiler(netId)->EnableProfiling(true);
870  std::vector<ImportedInputId> importedInputIds =
871  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
872  std::vector<ImportedOutputId> importedOutputIds =
873  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
874  // Do the inference and force the import as the memory is aligned.
875  runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
876 
877  // Retrieve the Profiler.Print() output to get the workload execution
878  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
879  std::stringstream ss;
880  profilerManager.GetProfiler()->Print(ss);
881  std::string dump = ss.str();
882 
883  if (backends[0] == Compute::CpuAcc)
884  {
885  // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
886  // reconfigure is implemented
887  int count = SubStringCounter(dump, "SyncMemGeneric");
888  CHECK(count == 0);
889  // Should be 2 CopyMemGeneric workloads
890  count = SubStringCounter(dump, "CopyMemGeneric");
891  CHECK(count == 2);
892  }
893  else
894  {
895  // Check there is a SyncMemGeneric workload as we exported
896  int count = SubStringCounter(dump, "SyncMemGeneric");
897  CHECK(count == 1);
898  // Shouldn't be any CopyMemGeneric workloads
899  count = SubStringCounter(dump, "CopyMemGeneric");
900  CHECK(count == 0);
901  }
902  // Check the output is correct
903  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
904 }
905 
906 inline void ForceImportWithMisalignedInputBuffersEndToEndTest(std::vector<BackendId> backends)
907 {
908  /**
909  * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
910  * with 4 floats, square them, and validate the output. We then check the profiling logs to see if input/output
911  * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
912  * In this case only the output should be imported
913  */
914  using namespace armnn;
915 
916  IRuntime::CreationOptions options;
917  IRuntimePtr runtime(IRuntime::Create(options));
918 
919  // Builds up the structure of the network.
920  INetworkPtr net(INetwork::Create());
921  IConnectableLayer* input = net->AddInputLayer(0);
922 
923  ActivationDescriptor descriptor;
924  descriptor.m_Function = ActivationFunction::Square;
925  IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
926 
927  IConnectableLayer* output = net->AddOutputLayer(0);
928 
929  input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
930  activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
931  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
932  activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
933 
934  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
935  INFO("Load Network");
936  // Load it into the runtime. It should pass.
937  NetworkId netId;
938  std::string ignoredErrorMessage;
939  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
940  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
941  == Status::Success);
942  INFO("Generate Data");
943 
944  // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
945  // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
946  auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
947 
948  float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
949 
950  // Check if our pointer is truly misaligned
951  uintptr_t alignment = GetDataTypeSize(DataType::Float32);
952  CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
953 
954  std::vector<float> inputData
955  {
956  1.0f, 2.0f, 3.0f, 4.0f
957  };
958 
959  std::memcpy(misalignedMemPtr, inputData.data(), 4*sizeof(float));
960 
961  std::vector<float> outputData(4);
962  // Check our output buffer is aligned
963  CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
964 
965  std::vector<float> expectedOutput
966  {
967  1.0f, 4.0f, 9.0f, 16.0f
968  };
969 
970  INFO("Create Inference");
971  InputTensors inputTensors
972  {
973  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedMemPtr)},
974  };
975  OutputTensors outputTensors
976  {
977  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
978  };
979  runtime->GetProfiler(netId)->EnableProfiling(true);
980  std::vector<ImportedInputId> importedInputIds =
981  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
982  std::vector<ImportedOutputId> importedOutputIds =
983  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
984 
985  // Do the inference and force the import as the memory is misaligned.
986  runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
987 
988  // Retrieve the Profiler.Print() output to get the workload execution
989  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
990  std::stringstream ss;
991  profilerManager.GetProfiler()->Print(ss);
992  std::string dump = ss.str();
993 
994  // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
995  // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
996  // for imports/copies. Only that the output is correct.
997  if (backends[0] != Compute::GpuAcc)
998  {
999  if (backends[0] == Compute::CpuAcc)
1000  {
1001  // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1002  // reconfigure is implemented
1003  // We should get 0 SyncMemGeneric for the Output
1004  int count = SubStringCounter(dump, "SyncMemGeneric");
1005  CHECK(count == 0);
1006  // Should be 2 CopyMemGeneric as we copied the input
1007  count = SubStringCounter(dump, "CopyMemGeneric");
1008  CHECK(count == 2);
1009  }
1010  else
1011  {
1012  // We should get 1 SyncMemGeneric for the Output
1013  int count = SubStringCounter(dump, "SyncMemGeneric");
1014  CHECK(count == 1);
1015  // Should only be 1 CopyMemGeneric as we copied the input
1016  count = SubStringCounter(dump, "CopyMemGeneric");
1017  CHECK(count == 1);
1018  }
1019  }
1020  // Check the output is correct
1021  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1022  std::free(memPtr);
1023 }
1024 
1025 inline void ForceImportWithMisalignedOutputBuffersEndToEndTest(std::vector<BackendId> backends)
1026 {
1027  /**
1028  * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1029  * with 4 floats, square them, and validate the output. We then check the profiling logs to see if input/output
1030  * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1031  * In this case only the input should be imported
1032  */
1033  using namespace armnn;
1034 
1035  IRuntime::CreationOptions options;
1036  IRuntimePtr runtime(IRuntime::Create(options));
1037 
1038  // Builds up the structure of the network.
1039  INetworkPtr net(INetwork::Create());
1040  IConnectableLayer* input = net->AddInputLayer(0);
1041 
1042  ActivationDescriptor descriptor;
1043  descriptor.m_Function = ActivationFunction::Square;
1044  IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1045 
1046  IConnectableLayer* output = net->AddOutputLayer(0);
1047 
1048  input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1049  activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1050  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1051  activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1052 
1053  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1054  INFO("Load Network");
1055  // Load it into the runtime. It should pass.
1056  NetworkId netId;
1057  std::string ignoredErrorMessage;
1058  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1059  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
1060  == Status::Success);
1061  INFO("Generate Data");
1062 
1063  // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1064  // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1065  auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1066 
1067  float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
1068 
1069  // Check if our pointer is truly misaligned
1070  uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1071  CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
1072 
1073  // Creates structures for input & output
1074  std::vector<float> inputData
1075  {
1076  1.0f, 2.0f, 3.0f, 4.0f
1077  };
1078 
1079  // Check our input buffer is aligned
1080  CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1081  std::vector<float> expectedOutput
1082  {
1083  1.0f, 4.0f, 9.0f, 16.0f
1084  };
1085 
1086  INFO("Create Inference");
1087  InputTensors inputTensors
1088  {
1089  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1090  };
1091  OutputTensors outputTensors
1092  {
1093  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedMemPtr)}
1094  };
1095  runtime->GetProfiler(netId)->EnableProfiling(true);
1096  std::vector<ImportedInputId> importedInputIds =
1097  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1098  std::vector<ImportedOutputId> importedOutputIds =
1099  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1100 
1101  // Do the inference and force the import as the memory is misaligned.
1102  runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
1103 
1104  // Retrieve the Profiler.Print() output to get the workload execution
1105  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
1106  std::stringstream ss;
1107  profilerManager.GetProfiler()->Print(ss);
1108  std::string dump = ss.str();
1109 
1110  // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1111  // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1112  // for imports/copies. Only that the output is correct.
1113  if (backends[0] != Compute::GpuAcc)
1114  {
1115  // Even though we Imported the Input we still shouldn't have a SyncMemGeneric
1116  int count = SubStringCounter(dump, "SyncMemGeneric");
1117  CHECK(count == 0);
1118  // Should only be 1 CopyMemGeneric as we copied the output
1119  count = SubStringCounter(dump, "CopyMemGeneric");
1120  if (backends[0] == Compute::CpuAcc)
1121  {
1122  // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1123  // reconfigure is implemented
1124  CHECK(count == 2);
1125  }
1126  else
1127  {
1128  CHECK(count == 1);
1129  }
1130  // Check the output is correct
1131  }
1132  unsigned int index = 0;
1133  std::vector<float> outputData(expectedOutput.size(), 0);
1134  std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
1135  for (auto outputValue : expectedOutput)
1136  {
1137  CHECK(outputValue == outputData[index]);
1138  ++index;
1139  }
1140  std::free(memPtr);
1141 }
1142 
1143 inline void ForceImportWithMisalignedInputAndOutputBuffersEndToEndTest(std::vector<BackendId> backends)
1144 {
1145  /**
1146  * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1147  * with 4 floats, square them, and validate the output. We then check the profiling logs to see if input/output
1148  * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1149  * In this case all inputs and outputs should be copied
1150  */
1151  using namespace armnn;
1152 
1153  IRuntime::CreationOptions options;
1154  IRuntimePtr runtime(IRuntime::Create(options));
1155 
1156  // Builds up the structure of the network.
1157  INetworkPtr net(INetwork::Create());
1158  IConnectableLayer* input = net->AddInputLayer(0);
1159 
1160  ActivationDescriptor descriptor;
1161  descriptor.m_Function = ActivationFunction::Square;
1162  IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1163 
1164  IConnectableLayer* output = net->AddOutputLayer(0);
1165 
1166  input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1167  activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1168  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1169  activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1170 
1171  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1172  INFO("Load Network");
1173  // Load it into the runtime. It should pass.
1174  NetworkId netId;
1175  std::string ignoredErrorMessage;
1176  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1177  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
1178  == Status::Success);
1179  INFO("Generate Data");
1180 
1181  // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1182  // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1183  auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1184  float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
1185 
1186  // Check if our pointer is truly misaligned
1187  uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1188  CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
1189  std::vector<float> inputData
1190  {
1191  1.0f, 2.0f, 3.0f, 4.0f
1192  };
1193  std::memcpy(misalignedInputPtr, inputData.data(), 4*sizeof(float));
1194 
1195  auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1196  float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
1197 
1198  // Check if our pointer is truly misaligned
1199  CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
1200 
1201  std::vector<float> expectedOutput
1202  {
1203  1.0f, 4.0f, 9.0f, 16.0f
1204  };
1205 
1206  INFO("Create Inference");
1207  InputTensors inputTensors
1208  {
1209  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
1210  };
1211  OutputTensors outputTensors
1212  {
1213  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
1214  };
1215  runtime->GetProfiler(netId)->EnableProfiling(true);
1216  std::vector<ImportedInputId> importedInputIds =
1217  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1218  std::vector<ImportedOutputId> importedOutputIds =
1219  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1220 
1221  // Do the inference and force the import as the memory is misaligned.
1222  runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
1223 
1224  // Retrieve the Profiler.Print() output to get the workload execution
1225  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
1226  std::stringstream ss;
1227  profilerManager.GetProfiler()->Print(ss);
1228  std::string dump = ss.str();
1229 
1230  // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1231  // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1232  // for imports/copies. Only that the output is correct.
1233  if (backends[0] != Compute::GpuAcc)
1234  {
1235  // We can only copy so there should be no SyncMemGeneric
1236  int count = SubStringCounter(dump, "SyncMemGeneric");
1237  CHECK(count == 0);
1238  // Should only be CopyMemGeneric workloads as we copied all buffers
1239  count = SubStringCounter(dump, "CopyMemGeneric");
1240  CHECK(count == 2);
1241  }
1242  // Check the output is correct
1243  unsigned int index = 0;
1244  std::vector<float> outputData(expectedOutput.size(), 0);
1245  std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
1246  for (auto expectedValue : expectedOutput)
1247  {
1248  CHECK(expectedValue == outputData[index]);
1249  ++index;
1250  }
1251  std::free(inputMemPtr);
1252  std::free(outputMemPtr);
1253 }
1254 
1255 inline void ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)
1256 {
1257  /**
1258  * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1259  * with 4 floats, square them, and validate the output. We then check the profiling logs to see if input/output
1260  * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1261  * In this test we create some aligned buffers, import them into a network and validate the output and number of
1262  * SyncMemGeneric/CopyMemGeneric workloads. Then we try the same network again with misaligned buffers to make sure it falls
1263  * back to copying correctly.
1264  */
1265  using namespace armnn;
1266 
1267  IRuntime::CreationOptions options;
1268  IRuntimePtr runtime(IRuntime::Create(options));
1269 
1270  // Builds up the structure of the network.
1271  INetworkPtr net(INetwork::Create());
1272  IConnectableLayer* input = net->AddInputLayer(0);
1273 
1274  ActivationDescriptor descriptor;
1275  descriptor.m_Function = ActivationFunction::Square;
1276  IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1277 
1278  IConnectableLayer* output = net->AddOutputLayer(0);
1279 
1280  input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1281  activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1282  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1283  activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1284 
1285  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1286  INFO("Load Network");
1287  // Load it into the runtime. It should pass.
1288  NetworkId netId;
1289  std::string ignoredErrorMessage;
1290  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1291  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
1292  == Status::Success);
1293  INFO("Generate Data");
1294 
1295  // Creates structures for input & output
1296  std::vector<float> inputData
1297  {
1298  1.0f, 2.0f, 3.0f, 4.0f
1299  };
1300  std::vector<float> outputData(4);
1301  std::vector<float> expectedOutput
1302  {
1303  1.0f, 4.0f, 9.0f, 16.0f
1304  };
1305 
1306  // Check our input and output pointers are actually aligned
1307  uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1308  CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1309  CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
1310 
1311  INFO("Create Inference");
1312  InputTensors inputTensors
1313  {
1314  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1315  };
1316  OutputTensors outputTensors
1317  {
1318  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
1319  };
1320 
1321  runtime->GetProfiler(netId)->EnableProfiling(true);
1322  std::vector<ImportedInputId> importedInputIds =
1323  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1324  std::vector<ImportedOutputId> importedOutputIds =
1325  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1326  // Do the inference and force the import as the memory is aligned.
1327  runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
1328 
1329  // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1330  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
1331  std::stringstream ss;
1332  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1333  std::string dump = ss.str();
1334 
1335  if (backends[0] == Compute::CpuAcc)
1336  {
1337  // Reconfigure has not been implemented for CpuAcc so it will always copy, this will break whenever
1338  // reconfigure is implemented
1339  int count = SubStringCounter(dump, "SyncMemGeneric");
1340  CHECK(count == 0);
1341  // Should be 2 CopyMemGeneric workloads
1342  count = SubStringCounter(dump, "CopyMemGeneric");
1343  CHECK(count >= 1);
1344  }
1345  else
1346  {
1347  // Check there is at least 1 SyncMemGeneric workload as we exported
1348  int count = SubStringCounter(dump, "SyncMemGeneric");
1349  CHECK(count >= 1);
1350  // Shouldn't be any CopyMemGeneric workloads
1351  count = SubStringCounter(dump, "CopyMemGeneric");
1352  CHECK(count == 0);
1353  }
1354  // Check the output is correct
1355  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1356 
1357  // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1358  // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1359  auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1360  float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
1361 
1362  // Check if our pointer is truly misaligned
1363  CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
1364 
1365  std::vector<float> inputValues
1366  {
1367  2.0f, 3.0f, 4.0f, 5.0f
1368  };
1369 
1370  std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size()*sizeof(float));
1371 
1372  auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1373  float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
1374 
1375  // Check if our pointer is truly misaligned
1376  CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
1377 
1378  std::vector<float> expectedMisalignedOutput
1379  {
1380  4.0f, 9.0f, 16.0f, 25.0f
1381  };
1382 
1383  INFO("Create Second Inference");
1384  InputTensors inputTensorsMisaligned
1385  {
1386  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
1387  };
1388  OutputTensors outputTensorsMisaligned
1389  {
1390  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
1391  };
1392  importedInputIds = runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
1393  importedOutputIds = runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
1394 
1395  // Do the inference and force the import as the memory is misaligned.
1396  runtime->EnqueueWorkload(netId,
1397  inputTensorsMisaligned,
1398  outputTensorsMisaligned,
1399  importedInputIds,
1400  importedOutputIds);
1401 
1402  // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1403  // We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
1404  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1405  dump = ss.str();
1406 
1407  // GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
1408  // new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
1409  // for imports/copies. Only that the output is correct.
1410  if (backends[0] != Compute::GpuAcc)
1411  {
1412  // The SyncMemGeneric will still be in the profiling log from the first inference
1413  int count = SubStringCounter(dump, "SyncMemGeneric");
1414  CHECK(count >= 1);
1415  // We should now see CopyMemGeneric workloads as we copied all buffers
1416  count = SubStringCounter(dump, "CopyMemGeneric");
1417  CHECK(count >= 1);
1418  }
1419  // Check the output is correct
1420  unsigned int index = 0;
1421  std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
1422  std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
1423  for (auto outputValue : expectedMisalignedOutput)
1424  {
1425  CHECK(outputValue == alignedOutputData[index]);
1426  ++index;
1427  }
1428  // Clean up to avoid interfering with other tests
1429  runtime->UnloadNetwork(netId);
1430  std::free(inputMemPtr);
1431  std::free(outputMemPtr);
1432 }
1433 
1434 
1435 inline void ForceImportRepeatedInferencesInvertedEndToEndTest(std::vector<BackendId> backends)
1436 {
1437  /**
1438  * This test is similar to the Import tests above, we create a network with a square function and pass in a vector
1439  * with 4 floats, square them, and validate the output. We then check the profiling logs to see if input/output
1440  * tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric)
1441  * In this test we create some misaligned buffers, copy them into a network and validate the output and number of
1442  * SyncMemGeneric/CopyMemGeneric workloads. Then we try the same network again with aligned buffers to make sure it switches
1443  * to importing correctly.
1444  */
1445  using namespace armnn;
1446 
1447  IRuntime::CreationOptions options;
1448  IRuntimePtr runtime(IRuntime::Create(options));
1449 
1450  // Builds up the structure of the network.
1451  INetworkPtr net(INetwork::Create());
1452  IConnectableLayer* input = net->AddInputLayer(0);
1453 
1454  ActivationDescriptor descriptor;
1455  descriptor.m_Function = ActivationFunction::Square;
1456  IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
1457 
1458  IConnectableLayer* output = net->AddOutputLayer(0);
1459 
1460  input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
1461  activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1462  input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
1463  activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
1464 
1465  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
1466  INFO("Load Network");
1467  // Load it into the runtime. It should pass.
1468  NetworkId netId;
1469  std::string ignoredErrorMessage;
1470  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1471  CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
1472  == Status::Success);
1473  INFO("Generate Data");
1474 
1475  // This code looks a little funky but the idea is to create a buffer of floats but offset by the size of a char
1476  // this will guarantee that the resultant buffer is misaligned and thus should always be copied.
1477  auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1478  float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
1479 
1480  // Check if our pointer is truly misaligned
1481  uintptr_t alignment = GetDataTypeSize(DataType::Float32);
1482  CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
1483  std::vector<float> inputValues
1484  {
1485  2.0f, 3.0f, 4.0f, 5.0f
1486  };
1487  std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));
1488 
1489  auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
1490  float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
1491 
1492  // Check if our pointer is truly misaligned
1493  CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
1494 
1495  std::vector<float> expectedMisalignedOutput
1496  {
1497  4.0f, 9.0f, 16.0f, 25.0f
1498  };
1499 
1500  INFO("Create Second Inference");
1501  InputTensors inputTensorsMisaligned
1502  {
1503  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
1504  };
1505  OutputTensors outputTensorsMisaligned
1506  {
1507  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
1508  };
1509  runtime->GetProfiler(netId)->EnableProfiling(true);
1510  std::vector<ImportedInputId> importedInputIds =
1511  runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
1512  std::vector<ImportedOutputId> importedOutputIds =
1513  runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
1514 
1515  // Do the inference and force the import as the memory is misaligned.
1516  runtime->EnqueueWorkload(netId,
1517  inputTensorsMisaligned,
1518  outputTensorsMisaligned,
1519  importedInputIds,
1520  importedOutputIds);
1521 
1522  // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1523  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
1524  std::stringstream ss;
1525  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1526  std::string dump = ss.str();
1527 
1528  // GpuAcc is a different case to CpuRef and CpuAcc: it doesn't use the buffer directly but instead maps it to a
1529  // new set of addresses within GPU memory. These will almost always be auto-aligned, so we don't need to check
1530  // for imports/copies here, only that the output is correct.
1531  if (backends[0] != Compute::GpuAcc)
1532  {
1533  // We can only copy so there should be no SyncMemGeneric
1534  int count = SubStringCounter(dump, "SyncMemGeneric");
1535  CHECK(count == 0);
1536  // Should only be CopyMemGeneric workloads as we copied all buffers
1537  count = SubStringCounter(dump, "CopyMemGeneric");
1538  CHECK(count >= 1);
1539  }
1540  // Check the output is correct
1541  unsigned int index = 0;
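      // Read the results back via memcpy into an aligned vector first; dereferencing misalignedOutputPtr as a
      // float* directly would be an unaligned access, which is undefined behaviour on some platforms.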
1542  std::vector<float> alignedOutput(expectedMisalignedOutput.size());
1543  std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size()*sizeof(float));
1544  for (auto outputValue : expectedMisalignedOutput)
1545  {
1546  CHECK(outputValue == alignedOutput[index]);
1547  ++index;
1548  }
1549  std::free(inputMemPtr);
1550  std::free(outputMemPtr);
1551 
1552  // Creates structures for input & output
1553  std::vector<float> inputData
1554  {
1555  1.0f, 2.0f, 3.0f, 4.0f
1556  };
1557  std::vector<float> outputData(4);
1558  std::vector<float> expectedOutput
1559  {
1560  1.0f, 4.0f, 9.0f, 16.0f
1561  };
1562 
1563  // Check our input and output pointers are actually aligned
1564  CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
1565  CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
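      // std::vector's default allocator returns storage suitably aligned for float, so these buffers satisfy the
      // alignment requirement for importing.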
1566 
1567  INFO("Create Inference");
1568  InputTensors inputTensors
1569  {
1570  {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
1571  };
1572  OutputTensors outputTensors
1573  {
1574  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
1575  };
1576 
1577  importedInputIds = runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
1578  importedOutputIds = runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
1579  // Do the inference and force the import as the memory is aligned.
1580  runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
1581 
1582  // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1583  // We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
1584  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1585  dump = ss.str();
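      // Note that ss still holds the events written after the first inference, so the counts below are checked as
      // lower bounds rather than exact values.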
1586 
1587  if (backends[0] == Compute::CpuAcc)
1588  {
1589  // Reconfigure has not been implemented for CpuAcc, so it will always copy. This check will break once
1590  // reconfigure is implemented.
1591  int count = SubStringCounter(dump, "SyncMemGeneric");
1592  CHECK(count == 0);
1593  // There should be CopyMemGeneric workloads for the copied input and output buffers
1594  count = SubStringCounter(dump, "CopyMemGeneric");
1595  CHECK(count >= 1);
1596  }
1597  else
1598  {
1599  // Repeated inferences make it difficult to check for an accurate count, so we just validate that a
1600  // SyncMemGeneric workload is present where previously there was none
1601  int count = SubStringCounter(dump, "SyncMemGeneric");
1602  CHECK(count >= 1);
1603  // Should still be some CopyMemGeneric Workloads from the last inference
1604  count = SubStringCounter(dump, "CopyMemGeneric");
1605  CHECK(count >= 1);
1606  }
1607  // Check the output is correct
1608  CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
1609  // Clean up to avoid interfering with other tests
1610  runtime->UnloadNetwork(netId);
1611 }
1612 
1613 } // anonymous namespace