ArmNN 22.05
ClImportTensorHandleTests.cpp
//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <arm_compute/runtime/CL/functions/CLActivationLayer.h>

#include <cl/ClImportTensorHandle.hpp>
#include <cl/ClImportTensorHandleFactory.hpp>
#include <cl/test/ClContextControlFixture.hpp>

#include <doctest/doctest.h>

#include <armnn/IRuntime.hpp>
#include <armnn/INetwork.hpp>
#include "Network.hpp"

using namespace armnn;

TEST_SUITE("ClImportTensorHandleTests")
{
TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
    unsigned int numElements = info.GetNumElements();

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Create and configure activation function
    const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
    arm_compute::CLActivationLayer act_func;
    act_func.configure(&tensor, nullptr, act_info);

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Import memory
    CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc));

    // Fill input with negative values
    auto* typedPtr = reinterpret_cast<float*>(alignedPtr);
    std::fill_n(typedPtr, numElements, -5.0f);

    // Execute function and sync
    act_func.run();
    arm_compute::CLScheduler::get().sync();

    // Validate result by checking that the output has no negative values
    for (unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(typedPtr[i] == 0);
    }
}
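
// A minimal sketch (not part of the original tests) of the cache-line-aligned
// allocation pattern every test in this suite repeats: over-allocate by two
// alignments, then let std::align find the first suitably aligned byte.
// AlignedBuffer and AllocateCacheLineAligned are hypothetical helper names and
// are not used elsewhere in this file.
struct AlignedBuffer
{
    std::unique_ptr<uint8_t[]> storage; // owns the raw, possibly misaligned allocation
    void* alignedPtr = nullptr;         // first byte aligned to the GPU cache line
};

static AlignedBuffer AllocateCacheLineAligned(size_t totalBytes)
{
    // The alignment requirement comes from the OpenCL device's global memory cache line
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    // Two extra alignments guarantee std::align can always carve out totalBytes
    size_t space = totalBytes + alignment + alignment;
    AlignedBuffer buf{ std::make_unique<uint8_t[]>(space), nullptr };
    void* ptr = buf.storage.get();
    buf.alignedPtr = std::align(alignment, totalBytes, ptr, space);
    return buf;
}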

TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Importing with the wrong memory source should throw
    CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport")
{
    MemorySource invalidMemSource = static_cast<MemorySource>(256);
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource),
                                              static_cast<MemorySourceFlags>(invalidMemSource));

    TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32);

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info);

    // Allocate user memory
    std::vector<float> inputData
    {
        1.0f, 2.0f, 3.0f, 4.0f
    };

    // Importing from a non-supported memory source should throw
    CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
}
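
// A note on the test above (not part of the original file): MemorySource values
// are bit flags, so a factory can be constructed to accept several import sources
// at once. A hedged sketch, assuming the MemorySource::DmaBuf enumerator from
// armnn/Types.hpp:
//
//     MemorySourceFlags importFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc) |
//                                     static_cast<MemorySourceFlags>(MemorySource::DmaBuf);
//     ClImportTensorHandleFactory multiSourceFactory(importFlags, importFlags);
//
// The test casts 256 to MemorySource, which corresponds to no defined enumerator,
// so Import() cannot map it to a supported import mechanism and throws.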

TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input = net->AddInputLayer(0, "Input");

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::ReLu;
    IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");

    IConnectableLayer* output = net->AddOutputLayer(0, "Output");

    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
    unsigned int numElements = tensorInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
    activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = true;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Enable Importing
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Input with negative values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::fill_n(inputPtr, numElements, -5.0f);

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ActivationWorkload
    std::size_t found = dump.find("ActivationWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected: validate that the output has no negative values
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    for (unsigned int i = 0; i < numElements; ++i)
    {
        CHECK(outputResult[i] >= 0);
    }
}
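
// A note on the profiler checks above (not part of the original file): the dump
// contains one entry per executed workload name. As used by these tests,
// "SyncMemGeneric" indicates an imported output that only needed a map/sync to
// become visible to the caller, while "CopyMemGeneric" indicates a fallback copy
// between tensor handles; a genuine zero-copy import therefore produces the
// former and never the latter.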

TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

    // Create TensorHandle for memory import
    auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);

    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Allocate user memory
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Querying CanBeImported with an undefined memory source should throw
    CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
}

TEST_CASE("ClCanBeImportedAlignedMemory")
{
    ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));

    TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);

    // Create TensorHandle (Memory Managed status is irrelevant)
    auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
    // Get CLtensor
    arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

    // Create an aligned buffer
    const size_t totalBytes = tensor.info()->total_size();
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto testData = std::make_unique<uint8_t[]>(space);
    void* alignedPtr = testData.get();
    CHECK(std::align(alignment, totalBytes, alignedPtr, space));

    // Check aligned buffers return true
    CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);

    // Due to the nature of how GPU memory is mapped, it is entirely possible for memory which is misaligned on the
    // CPU to be successfully imported on the GPU. As such there is no way to create a misaligned pointer that will
    // always fail; rather, it will succeed on some devices and fail on others. As long as a correctly aligned
    // buffer returns true we can be confident that it will be successfully imported. All other cases will need to
    // be handled by the user.
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
{
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;

    ARMNN_NO_DEPRECATE_WARN_BEGIN
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
                                                                               weights,
                                                                               armnn::EmptyOptional(),
                                                                               "conv");
    ARMNN_NO_DEPRECATE_WARN_END
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Importing is forced below via ImportInputs/ImportOutputs, so load with no import source
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}
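
// A hand check of the first expected value above (not part of the original file).
// The 1x3x4x1 NHWC input, viewed as a 3x4 image, is
//     1 5 2 3
//     8 7 3 6
//     3 3 9 1
// With one pixel of zero padding, the output at (0,0) correlates the 3x3 kernel
// with the padded neighbourhood of the top-left pixel: the kernel's top row
// [4 5 6] lands entirely on padding, its middle row is all zeros, and its bottom
// row [3 2 1] overlaps pad, 8 and 7, giving 3*0 + 2*8 + 1*7 = 23, which matches
// expectedOutput[0].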

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);

    std::vector<float> expectedOutput =
    {
        -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
        1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(Half);
    size_t totalBytesOutput = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Importing is forced below via ImportInputs/ImportOutputs, so load with no import source
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill input with negative and positive values
    auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
    inputPtr[0] = -37.5_h;
    inputPtr[1] = -15.2_h;
    inputPtr[2] = -8.76_h;
    inputPtr[3] = -2.0_h;
    inputPtr[4] = -1.5_h;
    inputPtr[5] = -1.3_h;
    inputPtr[6] = -0.5_h;
    inputPtr[7] = -0.4_h;
    inputPtr[8] = 0.0_h;
    inputPtr[9] = 1.0_h;
    inputPtr[10] = 0.4_h;
    inputPtr[11] = 0.5_h;
    inputPtr[12] = 1.3_h;
    inputPtr[13] = 1.5_h;
    inputPtr[14] = 2.0_h;
    inputPtr[15] = 8.76_h;
    inputPtr[16] = 15.2_h;
    inputPtr[17] = 37.5_h;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp16ToFp32Workload
    std::size_t found = dump.find("ConvertFp16ToFp32Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    for (size_t i = 0; i < numElements; ++i)
    {
        DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
                              "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
    }
}
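
// A note on the Approx comparison above (not part of the original file): it allows
// for float16 rounding. IEEE half precision keeps a 10-bit mantissa, so a value
// such as 8.76 is stored as the nearest representable half and survives the round
// trip back to float32 only to within roughly one part in two thousand, hence the
// relative epsilon of 0.0004 rather than an exact equality check.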

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);

    std::vector<Half> expectedOutput =
    {
        -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
        1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(float);
    size_t totalBytesOutput = numElements * sizeof(Half);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Importing is forced below via ImportInputs/ImportOutputs, so load with no import source
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill input with negative and positive values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = -37.5f;
    inputPtr[1] = -15.2f;
    inputPtr[2] = -8.76f;
    inputPtr[3] = -2.0f;
    inputPtr[4] = -1.5f;
    inputPtr[5] = -1.3f;
    inputPtr[6] = -0.5f;
    inputPtr[7] = -0.4f;
    inputPtr[8] = 0.0f;
    inputPtr[9] = 1.0f;
    inputPtr[10] = 0.4f;
    inputPtr[11] = 0.5f;
    inputPtr[12] = 1.3f;
    inputPtr[13] = 1.5f;
    inputPtr[14] = 2.0f;
    inputPtr[15] = 8.76f;
    inputPtr[16] = 15.2f;
    inputPtr[17] = 37.5f;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp32ToFp16Workload
    std::size_t found = dump.find("ConvertFp32ToFp16Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
{
    using namespace half_float::literal;

    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    NetworkImpl network;

    armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
    armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);

    std::vector<Half> expectedOutput = { 1.0_h };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytesInput = numElements * sizeof(float);
    size_t totalBytesOutput = numElements * sizeof(Half);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network.AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Importing is forced below via ImportInputs/ImportOutputs, so load with no import source
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t spaceInput = totalBytesInput + alignment + alignment;
    size_t spaceOutput = totalBytesOutput + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(spaceInput);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));

    // Fill input with a value
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1.0f;

    auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
    auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Contains ConvertFp32ToFp16Workload
    std::size_t found = dump.find("ConvertFp32ToFp16Workload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    runtime->UnloadNetwork(netId);

    // Check output is as expected
    auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
    CHECK(outputResult);

    // Check the output is correct
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
{
/*
 * This is a test to check the functionality of the Forced Import functionality when using repeated inferences that
 * require switching from importing to copy. For the first inference we create aligned pointers and check they are
 * imported correctly. For the second we use similar pointers but don't use pre-importing, to force a fall back to
 * copying.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    ARMNN_NO_DEPRECATE_WARN_BEGIN
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
                                                                               weights,
                                                                               armnn::EmptyOptional(),
                                                                               "conv");
    ARMNN_NO_DEPRECATE_WARN_END
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Importing is forced below via ImportInputs/ImportOutputs, so load with no import source
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr = inputData.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    auto outputData = std::make_unique<uint8_t[]>(space);
    void* alignedOutputPtr = outputData.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Contains SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Does not contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found == std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and without using pre-importing to force it to fall back to copying

    // Creates structures for input & output
    auto inputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputDataCopy.get();

    // Fill input with values
    auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr);
    inputCopyPtr[0] = 1;
    inputCopyPtr[1] = 5;
    inputCopyPtr[2] = 2;
    inputCopyPtr[3] = 3;
    inputCopyPtr[4] = 8;
    inputCopyPtr[5] = 7;
    inputCopyPtr[6] = 3;
    inputCopyPtr[7] = 6;
    inputCopyPtr[8] = 3;
    inputCopyPtr[9] = 3;
    inputCopyPtr[10] = 9;
    inputCopyPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataCopy = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputDataCopy.get();
    auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputCopyPtr, numElements, -10.0f);

    InputTensors inputTensorsCopy
    {
        {0, armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensorsCopy
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    // Do the inference without any pre-imported input/output ids
    runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again, this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should still contain the SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should now also contain a CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
    runtime->UnloadNetwork(netId);
}

TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest")
{
/*
 * This test is similar to the test above, but instead of importing and then copying we start by copying and then
 * do the import.
 */
    // Create runtime in which test will run
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(armnn::IRuntime::Create(options));

    // Build up the structure of the network
    INetworkPtr network(INetwork::Create());

    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);

    kernelInfo.SetConstant(true);

    std::vector<float> kernel =
    {
        4, 5, 6,
        0, 0, 0,
        3, 2, 1
    };

    const std::vector<float> expectedOutput =
    {
        23, 41, 33, 21,
        44, 65, 76, 52,
        82, 85, 79, 42
    };

    unsigned int numElements = inputInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
    ARMNN_ASSERT(inputLayer);

    armnn::ConstTensor weights(kernelInfo, kernel);

    armnn::Convolution2dDescriptor convDesc2d;
    convDesc2d.m_StrideX = 1;
    convDesc2d.m_StrideY = 1;
    convDesc2d.m_PadLeft = 1;
    convDesc2d.m_PadRight = 1;
    convDesc2d.m_PadTop = 1;
    convDesc2d.m_PadBottom = 1;
    convDesc2d.m_DataLayout = DataLayout::NHWC;
    ARMNN_NO_DEPRECATE_WARN_BEGIN
    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
                                                                               weights,
                                                                               armnn::EmptyOptional(),
                                                                               "conv");
    ARMNN_NO_DEPRECATE_WARN_END
    ARMNN_ASSERT(convLayer);

    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    IConnectableLayer* output = network->AddOutputLayer(0, "output");
    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Optimize the network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = false;
    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Loads it into the runtime.
    NetworkId netId;
    std::string ignoredErrorMessage;
    // Importing is forced below via ImportInputs/ImportOutputs, so load with no import source
    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment;
    auto inputData = std::make_unique<uint8_t[]>(space);
    void* copyInputPtr = inputData.get();

    // Fill input with values
    auto* inputPtr = reinterpret_cast<float*>(copyInputPtr);
    inputPtr[0] = 1;
    inputPtr[1] = 5;
    inputPtr[2] = 2;
    inputPtr[3] = 3;
    inputPtr[4] = 8;
    inputPtr[5] = 7;
    inputPtr[6] = 3;
    inputPtr[7] = 6;
    inputPtr[8] = 3;
    inputPtr[9] = 3;
    inputPtr[10] = 9;
    inputPtr[11] = 1;

    // Create output buffer and fill it with -10.0f
    auto outputData = std::make_unique<uint8_t[]>(space);
    void* copyOutputPtr = outputData.get();
    auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference without any pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    std::string dump = ss.str();

    // Contains Convolution2dWorkload
    std::size_t found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Does not contain SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found == std::string::npos);

    // Does contain CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check output is as expected
    auto* outputResult = reinterpret_cast<float*>(copyOutputPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Repeat the inference, with new tensors and while using pre-importing to force it to import

    // Creates structures for input & output
    auto inputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedInputImportPtr = inputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space));

    // Fill input with values
    auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr);
    inputImportPtr[0] = 1;
    inputImportPtr[1] = 5;
    inputImportPtr[2] = 2;
    inputImportPtr[3] = 3;
    inputImportPtr[4] = 8;
    inputImportPtr[5] = 7;
    inputImportPtr[6] = 3;
    inputImportPtr[7] = 6;
    inputImportPtr[8] = 3;
    inputImportPtr[9] = 3;
    inputImportPtr[10] = 9;
    inputImportPtr[11] = 1;

    // Output pre-filled with -10.0f
    auto outputDataImport = std::make_unique<uint8_t[]>(space);
    void* alignedOutputImportPtr = outputDataImport.get();
    CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space));
    auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr);
    std::fill_n(outputImportPtr, numElements, -10.0f);

    InputTensors inputTensorsImport
    {
        {0, armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)},
    };
    OutputTensors outputTensorsImport
    {
        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)}
    };

    INFO("Run ImportInputs");
    std::vector<ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc);
    std::vector<ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc);

    // Do the inference with pre-imported inputs/outputs
    runtime->EnqueueWorkload(netId, inputTensorsImport, outputTensorsImport, importedInputIds, importedOutputIds);
    // Sync the outputs so we can read the data
    arm_compute::CLScheduler::get().sync();

    // Check the output is correct
    outputResult = reinterpret_cast<float*>(alignedOutputImportPtr);
    CHECK(outputResult);
    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));

    // Query the profiler again, this will contain the results of both inferences
    profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
    dump = ss.str();

    // Contains Convolution2dWorkload
    found = dump.find("Convolution2dWorkload");
    CHECK(found != std::string::npos);

    // Should now contain the SyncMemGeneric
    found = dump.find("SyncMemGeneric");
    CHECK(found != std::string::npos);

    // Should still contain a CopyMemGeneric from the first inference
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);
    runtime->UnloadNetwork(netId);
}

}