ArmNN
 22.08
ClImportTensorHandleTests.cpp File Reference
#include <arm_compute/runtime/CL/functions/CLActivationLayer.h>
#include <cl/ClImportTensorHandle.hpp>
#include <cl/ClImportTensorHandleFactory.hpp>
#include <cl/test/ClContextControlFixture.hpp>
#include <doctest/doctest.h>
#include <armnn/IRuntime.hpp>
#include <armnn/INetwork.hpp>
#include "Network.hpp"


Functions

 TEST_SUITE ("ClImportTensorHandleTests")
 

Function Documentation

◆ TEST_SUITE()

TEST_SUITE ( "ClImportTensorHandleTests"  )

Definition at line 20 of file ClImportTensorHandleTests.cpp.

References NetworkImpl::AddConvertFp16ToFp32Layer(), NetworkImpl::AddConvertFp32ToFp16Layer(), NetworkImpl::AddInputLayer(), NetworkImpl::AddOutputLayer(), IProfiler::AnalyzeEventsAndWriteResults(), ARMNN_ASSERT, IOutputSlot::Connect(), IRuntime::Create(), INetwork::Create(), ClImportTensorHandleFactory::CreateTensorHandle(), armnn::Float16, armnn::Float32, NetworkImpl::GetGraph(), BaseTensor< MemoryType >::GetInfo(), IConnectableLayer::GetInputSlot(), ProfilerManager::GetInstance(), TensorInfo::GetNumElements(), IConnectableLayer::GetOutputSlot(), ProfilerManager::GetProfiler(), armnn::GpuAcc, armnn::info, Convolution2dDescriptor::m_DataLayout, OptimizerOptions::m_ExportEnabled, ActivationDescriptor::m_Function, OptimizerOptions::m_ImportEnabled, Convolution2dDescriptor::m_PadBottom, Convolution2dDescriptor::m_PadLeft, Convolution2dDescriptor::m_PadRight, Convolution2dDescriptor::m_PadTop, Convolution2dDescriptor::m_StrideX, Convolution2dDescriptor::m_StrideY, armnn::Malloc, armnn::NHWC, armnn::Optimize(), IProfiler::Print(), armnn::ReLu, TensorInfo::SetConstant(), IOutputSlot::SetTensorInfo(), TEST_CASE_FIXTURE(), and armnn::Undefined.
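The suite exercises ArmNN's OpenCL memory-import path: a ClImportTensorHandleFactory creates import-capable tensor handles, user buffers aligned to the device cache-line size are imported with MemorySource::Malloc, and the end-to-end cases check the profiler output for SyncMemGeneric (import) rather than CopyMemGeneric (copy) workloads. The following is a condensed, illustrative sketch of the basic pattern from the first test case, assuming the same headers and using-declarations as this file; it omits the doctest assertions.

ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
                                          static_cast<MemorySourceFlags>(MemorySource::Malloc));
TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);

// Create an import-capable tensor handle and get at the underlying CLTensor.
auto handle = handleFactory.CreateTensorHandle(info);
arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();

// Allocate user memory padded and aligned to the device cache-line size.
const size_t totalBytes = tensor.info()->total_size();
const size_t alignment =
    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
size_t space = totalBytes + alignment + alignment;
auto userData = std::make_unique<uint8_t[]>(space);
void* alignedPtr = userData.get();
std::align(alignment, totalBytes, alignedPtr, space);

// Import the user memory so the CLTensor wraps it without a copy.
handle->Import(alignedPtr, MemorySource::Malloc);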

21 {
22 TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport")
23 {
24  ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
25  static_cast<MemorySourceFlags>(MemorySource::Malloc));
26 
27  TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
28  unsigned int numElements = info.GetNumElements();
29 
30  // create TensorHandle for memory import
31  auto handle = handleFactory.CreateTensorHandle(info);
32 
33  // Get CLtensor
34  arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
35 
36  // Create and configure activation function
37  const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
38  arm_compute::CLActivationLayer act_func;
39  act_func.configure(&tensor, nullptr, act_info);
40 
41  // Allocate user memory
42  const size_t totalBytes = tensor.info()->total_size();
43  const size_t alignment =
44  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
45  size_t space = totalBytes + alignment + alignment;
46  auto testData = std::make_unique<uint8_t[]>(space);
47  void* alignedPtr = testData.get();
48  CHECK(std::align(alignment, totalBytes, alignedPtr, space));
49 
50  // Import memory
51  CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc));
52 
53  // Input with negative values
54  auto* typedPtr = reinterpret_cast<float*>(alignedPtr);
55  std::fill_n(typedPtr, numElements, -5.0f);
56 
57  // Execute function and sync
58  act_func.run();
59  arm_compute::CLScheduler::get().sync();
60 
61  // Validate result by checking that the output has no negative values
62  for(unsigned int i = 0; i < numElements; ++i)
63  {
64  CHECK(typedPtr[i] == 0);
65  }
66 }
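The cache-line-aligned allocation above is repeated in every test case that follows. A small helper capturing the pattern might look like the sketch below; it is illustrative only (AllocateCacheLineAligned is not part of ArmNN) and assumes <memory> and <cstdint> plus the arm_compute CL headers already pulled in by this file.

// Illustrative helper: over-allocate a buffer so that an address aligned to the
// GPU cache-line size always exists inside it, returning both the owning storage
// and the aligned pointer.
struct AlignedBuffer
{
    std::unique_ptr<uint8_t[]> storage;
    void* alignedPtr = nullptr;
};

inline AlignedBuffer AllocateCacheLineAligned(size_t totalBytes)
{
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
    size_t space = totalBytes + alignment + alignment; // padding guarantees std::align can succeed
    AlignedBuffer buffer;
    buffer.storage = std::make_unique<uint8_t[]>(space);
    void* ptr = buffer.storage.get();
    buffer.alignedPtr = std::align(alignment, totalBytes, ptr, space);
    return buffer;
}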
67 
68 TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport")
69 {
70  ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
71  static_cast<MemorySourceFlags>(MemorySource::Malloc));
72 
73  TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
74 
75  // create TensorHandle for memory import
76  auto handle = handleFactory.CreateTensorHandle(info);
77 
78  // Get CLtensor
79  arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
80 
81  // Allocate user memory
82  const size_t totalBytes = tensor.info()->total_size();
83  const size_t alignment =
84  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
85  size_t space = totalBytes + alignment + alignment;
86  auto testData = std::make_unique<uint8_t[]>(space);
87  void* alignedPtr = testData.get();
88  CHECK(std::align(alignment, totalBytes, alignedPtr, space));
89 
90  // Attempt to import memory with an incorrect memory source
91  CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
92 }
93 
94 TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport")
95 {
96  MemorySource invalidMemSource = static_cast<MemorySource>(256);
97  ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource),
98  static_cast<MemorySourceFlags>(invalidMemSource));
99 
100  TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32);
101 
102  // create TensorHandle for memory import
103  auto handle = handleFactory.CreateTensorHandle(info);
104 
105  // Allocate user memory
106  std::vector<float> inputData
107  {
108  1.0f, 2.0f, 3.0f, 4.0f
109  };
110 
111  // Import non-supported memory
112  CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
113 }
114 
115 TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
116 {
117  // Create runtime in which test will run
118  IRuntime::CreationOptions options;
119  IRuntimePtr runtime(armnn::IRuntime::Create(options));
120 
121  // build up the structure of the network
122  INetworkPtr net(INetwork::Create());
123 
124  IConnectableLayer* input = net->AddInputLayer(0, "Input");
125 
126  ActivationDescriptor descriptor;
127  descriptor.m_Function = ActivationFunction::ReLu;
128  IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");
129 
130  IConnectableLayer* output = net->AddOutputLayer(0, "Output");
131 
132  input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
133  activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
134 
135  TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
136  unsigned int numElements = tensorInfo.GetNumElements();
137  size_t totalBytes = numElements * sizeof(float);
138 
139  input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
140  activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);
141 
142  // Optimize the network
143  OptimizerOptions optOptions;
144  optOptions.m_ImportEnabled = true;
145  optOptions.m_ExportEnabled = true;
146  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
147  IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
148  CHECK(optNet);
149 
150  // Loads it into the runtime.
151  NetworkId netId;
152  std::string ignoredErrorMessage;
153  // Enable Importing
154  INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
155  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
156 
157  // Creates structures for input & output
158  const size_t alignment =
159  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
160  size_t space = totalBytes + alignment + alignment;
161  auto inputData = std::make_unique<uint8_t[]>(space);
162  void* alignedInputPtr = inputData.get();
163  CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
164 
165  // Input with negative values
166  auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
167  std::fill_n(inputPtr, numElements, -5.0f);
168 
169  auto outputData = std::make_unique<uint8_t[]>(space);
170  void* alignedOutputPtr = outputData.get();
171  CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
172  auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
173  std::fill_n(outputPtr, numElements, -10.0f);
174 
175  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
176  inputTensorInfo.SetConstant(true);
177  InputTensors inputTensors
178  {
179  {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
180  };
181  OutputTensors outputTensors
182  {
183  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
184  };
185 
186  runtime->GetProfiler(netId)->EnableProfiling(true);
187 
188  // Do the inference
189  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
190 
191  // Retrieve the Profiler.Print() output to get the workload execution
192  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
193  std::stringstream ss;
194  profilerManager.GetProfiler()->Print(ss);
195  std::string dump = ss.str();
196 
197  // Contains ActivationWorkload
198  std::size_t found = dump.find("ActivationWorkload");
199  CHECK(found != std::string::npos);
200 
201  // Contains SyncMemGeneric
202  found = dump.find("SyncMemGeneric");
203  CHECK(found != std::string::npos);
204 
205  // Does not contain CopyMemGeneric
206  found = dump.find("CopyMemGeneric");
207  CHECK(found == std::string::npos);
208 
209  runtime->UnloadNetwork(netId);
210 
211  // Check output is as expected
212  // Validate result by checking that the output has no negative values
213  auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
214  CHECK(outputResult);
215  for(unsigned int i = 0; i < numElements; ++i)
216  {
217  CHECK(outputResult[i] >= 0);
218  }
219 }
220 
221 TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
222 {
223  ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
224  static_cast<MemorySourceFlags>(MemorySource::Malloc));
225 
226  TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
227 
228  // create TensorHandle for memory import
229  auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
230 
231  // Get CLtensor
232  arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
233 
234  // Allocate user memory
235  const size_t totalBytes = tensor.info()->total_size();
236  const size_t alignment =
237  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
238  size_t space = totalBytes + alignment + alignment;
239  auto testData = std::make_unique<uint8_t[]>(space);
240  void* alignedPtr = testData.get();
241  CHECK(std::align(alignment, totalBytes, alignedPtr, space));
242 
243  // CanBeImported should reject an undefined memory source
244  CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
245 
246 }
247 
248 TEST_CASE("ClCanBeImportedAlignedMemory")
249 {
250  ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
251  static_cast<MemorySourceFlags>(MemorySource::Malloc));
252 
253  TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);
254 
255  // create TensorHandle (Memory Managed status is irrelevant)
256  auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
257  // Get CLtensor
258  arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
259 
260  // Create an aligned buffer
261  const size_t totalBytes = tensor.info()->total_size();
262  const size_t alignment =
263  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
264  size_t space = totalBytes + alignment + alignment;
265  auto testData = std::make_unique<uint8_t[]>(space);
266  void* alignedPtr = testData.get();
267  CHECK(std::align(alignment, totalBytes, alignedPtr, space));
268 
269  // Check aligned buffers return true
270  CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);
271 
272  // Due to the nature of how GPU memory is mapped, it is entirely possible for memory which is misaligned on the CPU
273  // to be successfully imported on the GPU. As such there is no way to create a misaligned pointer that will always fail.
274  // Rather it will succeed on some devices and fail on others. As long as a correctly aligned buffer returns true
275  // we can be confident that it will be successfully imported. All other cases will need to be handled by the user.
276 }
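As the comment in the test above notes, only a correctly aligned buffer is guaranteed to be importable on every device, so a caller can probe with CanBeImported() before committing to the zero-copy path. A hedged sketch of that usage follows; ImportIfPossible is an illustrative name, not part of ArmNN.

// Try to import a user buffer; return true if the zero-copy path was taken.
// On failure the caller must fall back to a copy, e.g. via a handle from a
// non-import factory plus an explicit memcpy (not shown here).
bool ImportIfPossible(armnn::ClImportTensorHandle& handle, void* userPtr)
{
    using armnn::MemorySource;
    if (handle.CanBeImported(userPtr, MemorySource::Malloc))
    {
        return handle.Import(userPtr, MemorySource::Malloc); // CLTensor wraps userPtr directly
    }
    return false;
}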
277 
278 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
279 {
280  // Create runtime in which test will run
281  IRuntime::CreationOptions options;
282  IRuntimePtr runtime(armnn::IRuntime::Create(options));
283 
284  // build up the structure of the network
285  INetworkPtr network(INetwork::Create());
286 
287  armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
288  armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
289  armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
290 
291  kernelInfo.SetConstant(true);
292 
293  std::vector<float> kernel =
294  {
295  4, 5, 6,
296  0, 0, 0,
297  3, 2, 1
298  };
299 
300  const std::vector<float> expectedOutput =
301  {
302  23, 41, 33, 21,
303  44, 65, 76, 52,
304  82, 85, 79, 42
305  };
306 
307  unsigned int numElements = inputInfo.GetNumElements();
308  size_t totalBytes = numElements * sizeof(float);
309 
310  IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
311  ARMNN_ASSERT(inputLayer);
312 
313  armnn::ConstTensor weights(kernelInfo, kernel);
314 
315  Convolution2dDescriptor convDesc2d;
316  convDesc2d.m_StrideX = 1;
317  convDesc2d.m_StrideY = 1;
318  convDesc2d.m_PadLeft = 1;
319  convDesc2d.m_PadRight = 1;
320  convDesc2d.m_PadTop = 1;
321  convDesc2d.m_PadBottom = 1;
322  convDesc2d.m_DataLayout = DataLayout::NHWC;
323 
324  armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
325  armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
326 
327  ARMNN_ASSERT(convLayer);
328 
329  weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
330  weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
331 
332  inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
333  inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
334 
335  IConnectableLayer* output = network->AddOutputLayer(0, "output");
336  convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
337  convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
338 
339  // Optimize the network
340  OptimizerOptions optOptions;
341  optOptions.m_ImportEnabled = false;
342  optOptions.m_ExportEnabled = false;
343  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
344  IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
345  CHECK(optNet);
346 
347  // Loads it into the runtime.
348  NetworkId netId;
349  std::string ignoredErrorMessage;
350  // Import at load time is disabled; import is forced later via ImportInputs/ImportOutputs
351  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
352  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
353 
354  // Creates structures for input & output
355  const size_t alignment =
356  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
357  size_t space = totalBytes + alignment + alignment;
358  auto inputData = std::make_unique<uint8_t[]>(space);
359  void* alignedInputPtr = inputData.get();
360  CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
361 
362  // Fill input with values
363  auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
364  inputPtr[0] = 1;
365  inputPtr[1] = 5;
366  inputPtr[2] = 2;
367  inputPtr[3] = 3;
368  inputPtr[4] = 8;
369  inputPtr[5] = 7;
370  inputPtr[6] = 3;
371  inputPtr[7] = 6;
372  inputPtr[8] = 3;
373  inputPtr[9] = 3;
374  inputPtr[10] = 9;
375  inputPtr[11] = 1;
376 
377 
378  auto outputData = std::make_unique<uint8_t[]>(space);
379  void* alignedOutputPtr = outputData.get();
380  CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
381  auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
382  std::fill_n(outputPtr, numElements, -10.0f);
383 
384  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
385  inputTensorInfo.SetConstant(true);
386  InputTensors inputTensors
387  {
388  {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
389  };
390  OutputTensors outputTensors
391  {
392  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
393  };
394 
395  runtime->GetProfiler(netId)->EnableProfiling(true);
396 
397  INFO("Run ImportInputs");
398  std::vector<ImportedInputId> importedInputIds =
399  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
400  // We expect the import to have succeeded.
401  CHECK(importedInputIds.size() == 1);
402  std::vector<ImportedOutputId> importedOutputIds =
403  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
404  // We expect the import to have succeeded.
405  CHECK(importedOutputIds.size() == 1);
406  // Do the inference
407  runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
408 
409  // Retrieve the Profiler.Print() output to get the workload execution
410  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
411  std::stringstream ss;
412  profilerManager.GetProfiler()->Print(ss);
413  std::string dump = ss.str();
414 
415  // Contains Convolution2dWorkload
416  std::size_t found = dump.find("Convolution2dWorkload");
417  CHECK(found != std::string::npos);
418 
419  // Contains SyncMemGeneric
420  found = dump.find("SyncMemGeneric");
421  CHECK(found != std::string::npos);
422 
423  // Does not contain CopyMemGeneric
424  found = dump.find("CopyMemGeneric");
425  CHECK(found == std::string::npos);
426 
427  runtime->UnloadNetwork(netId);
428 
429  // Check output is as expected
430  // Validate the result against the expected convolution output
431  auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
432  CHECK(outputResult);
433 
434  // Check the output is correct
435  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
436 }
437 
438 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
439 {
440  using namespace half_float::literal;
441 
442  // Create runtime in which test will run
443  IRuntime::CreationOptions options;
444  IRuntimePtr runtime(armnn::IRuntime::Create(options));
445 
446  // build up the structure of the network
447  NetworkImpl network;
448 
449  armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
450  armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
451 
452  std::vector<float> expectedOutput =
453  {
454  -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
455  1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
456  };
457 
458  unsigned int numElements = inputInfo.GetNumElements();
459  size_t totalBytesInput = numElements * sizeof(Half);
460  size_t totalBytesOutput = numElements * sizeof(float);
461 
462  IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
463  ARMNN_ASSERT(inputLayer);
464 
465  armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
466  ARMNN_ASSERT(convLayer);
467 
468  inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
469  inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
470 
471  IConnectableLayer* output = network.AddOutputLayer(0, "output");
472  convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
473  convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
474 
475  // Optimize the network
476  OptimizerOptions optOptions;
477  optOptions.m_ImportEnabled = false;
478  optOptions.m_ExportEnabled = false;
479  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
480  IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
481  CHECK(optNet);
482 
483  // Loads it into the runtime.
484  NetworkId netId;
485  std::string ignoredErrorMessage;
486  // Import at load time is disabled; import is forced later via ImportInputs/ImportOutputs
487  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
488  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
489 
490  // Creates structures for input & output
491  const size_t alignment =
492  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
493  size_t spaceInput = totalBytesInput + alignment + alignment;
494  size_t spaceOutput = totalBytesOutput + alignment + alignment;
495  auto inputData = std::make_unique<uint8_t[]>(spaceInput);
496  void* alignedInputPtr = inputData.get();
497  CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
498 
499  // Fill input with a mix of negative and positive values
500  auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
501  inputPtr[0] = -37.5_h;
502  inputPtr[1] = -15.2_h;
503  inputPtr[2] = -8.76_h;
504  inputPtr[3] = -2.0_h;
505  inputPtr[4] = -1.5_h;
506  inputPtr[5] = -1.3_h;
507  inputPtr[6] = -0.5_h;
508  inputPtr[7] = -0.4_h;
509  inputPtr[8] = 0.0_h;
510  inputPtr[9] = 1.0_h;
511  inputPtr[10] = 0.4_h;
512  inputPtr[11] = 0.5_h;
513  inputPtr[12] = 1.3_h;
514  inputPtr[13] = 1.5_h;
515  inputPtr[14] = 2.0_h;
516  inputPtr[15] = 8.76_h;
517  inputPtr[16] = 15.2_h;
518  inputPtr[17] = 37.5_h;
519 
520  auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
521  void* alignedOutputPtr = outputData.get();
522  CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
523  auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
524  std::fill_n(outputPtr, numElements, -10.0f);
525 
526  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
527  inputTensorInfo.SetConstant(true);
528  InputTensors inputTensors
529  {
530  {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
531  };
532  OutputTensors outputTensors
533  {
534  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
535  };
536 
537  runtime->GetProfiler(netId)->EnableProfiling(true);
538 
539  INFO("Run ImportInputs");
540  std::vector<ImportedInputId> importedInputIds =
541  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
542  // We expect the import to have succeeded.
543  CHECK(importedInputIds.size() == 1);
544  std::vector<ImportedOutputId> importedOutputIds =
545  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
546  // We expect the import to have succeeded.
547  CHECK(importedOutputIds.size() == 1);
548 
549  // Do the inference
550  runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
551 
552  // Retrieve the Profiler.Print() output to get the workload execution
553  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
554  std::stringstream ss;
555  profilerManager.GetProfiler()->Print(ss);
556  std::string dump = ss.str();
557 
558  // Contains ConvertFp16ToFp32Workload
559  std::size_t found = dump.find("ConvertFp16ToFp32Workload");
560  CHECK(found != std::string::npos);
561 
562  // Contains SyncMemGeneric
563  found = dump.find("SyncMemGeneric");
564  CHECK(found != std::string::npos);
565 
566  // Does not contain CopyMemGeneric
567  found = dump.find("CopyMemGeneric");
568  CHECK(found == std::string::npos);
569 
570  runtime->UnloadNetwork(netId);
571 
572  // Check output is as expected
573  // Validate the result against the expected values
574  auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
575  CHECK(outputResult);
576 
577  // Check the output is correct
578  for (size_t i = 0; i < numElements; ++i)
579  {
580  DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
581  "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
582  }
583 }
584 
585 
586 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
587 {
588  using namespace half_float::literal;
589 
590  // Create runtime in which test will run
591  IRuntime::CreationOptions options;
592  IRuntimePtr runtime(armnn::IRuntime::Create(options));
593 
594  // build up the structure of the network
595  NetworkImpl network;
596 
597  armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
598  armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
599 
600  std::vector<Half> expectedOutput =
601  {
602  -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
603  1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
604  };
605 
606  unsigned int numElements = inputInfo.GetNumElements();
607  size_t totalBytesInput = numElements * sizeof(float);
608  size_t totalBytesOutput = numElements * sizeof(Half);
609 
610  IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
611  ARMNN_ASSERT(inputLayer);
612 
613  armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
614  ARMNN_ASSERT(convLayer);
615 
616  inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
617  inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
618 
619  IConnectableLayer* output = network.AddOutputLayer(0, "output");
620  convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
621  convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
622 
623  // Optimize the network
624  OptimizerOptions optOptions;
625  optOptions.m_ImportEnabled = false;
626  optOptions.m_ExportEnabled = false;
627  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
628  IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
629  CHECK(optNet);
630 
631  // Loads it into the runtime.
632  NetworkId netId;
633  std::string ignoredErrorMessage;
634  // Import at load time is disabled; import is forced later via ImportInputs/ImportOutputs
635  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
636  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
637 
638  // Creates structures for input & output
639  const size_t alignment =
640  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
641  size_t spaceInput = totalBytesInput + alignment + alignment;
642  size_t spaceOutput = totalBytesOutput + alignment + alignment;
643  auto inputData = std::make_unique<uint8_t[]>(spaceInput);
644  void* alignedInputPtr = inputData.get();
645  CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
646 
647  // Fill input with a mix of negative and positive values
648  auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
649  inputPtr[0] = -37.5f;
650  inputPtr[1] = -15.2f;
651  inputPtr[2] = -8.76f;
652  inputPtr[3] = -2.0f;
653  inputPtr[4] = -1.5f;
654  inputPtr[5] = -1.3f;
655  inputPtr[6] = -0.5f;
656  inputPtr[7] = -0.4f;
657  inputPtr[8] = 0.0f;
658  inputPtr[9] = 1.0f;
659  inputPtr[10] = 0.4f;
660  inputPtr[11] = 0.5f;
661  inputPtr[12] = 1.3f;
662  inputPtr[13] = 1.5f;
663  inputPtr[14] = 2.0f;
664  inputPtr[15] = 8.76f;
665  inputPtr[16] = 15.2f;
666  inputPtr[17] = 37.5f;
667 
668  auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
669  void* alignedOutputPtr = outputData.get();
670  CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
671  auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
672  std::fill_n(outputPtr, numElements, -10.0f);
673 
674  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
675  inputTensorInfo.SetConstant(true);
676  InputTensors inputTensors
677  {
678  {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
679  };
680  OutputTensors outputTensors
681  {
682  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
683  };
684 
685  runtime->GetProfiler(netId)->EnableProfiling(true);
686 
687  INFO("Run ImportInputs");
688  std::vector<ImportedInputId> importedInputIds =
689  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
690  // We expect the import to have succeeded.
691  CHECK(importedInputIds.size() == 1);
692  std::vector<ImportedOutputId> importedOutputIds =
693  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
694  // We expect the import to have succeeded.
695  CHECK(importedOutputIds.size() == 1);
696 
697  // Do the inference
698  runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
699 
700  // Retrieve the Profiler.Print() output to get the workload execution
701  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
702  std::stringstream ss;
703  profilerManager.GetProfiler()->Print(ss);
704  std::string dump = ss.str();
705 
706  // Contains ConvertFp32ToFp16Workload
707  std::size_t found = dump.find("ConvertFp32ToFp16Workload");
708  CHECK(found != std::string::npos);
709 
710  // Contains SyncMemGeneric
711  found = dump.find("SyncMemGeneric");
712  CHECK(found != std::string::npos);
713 
714  // Does not contain CopyMemGeneric
715  found = dump.find("CopyMemGeneric");
716  CHECK(found == std::string::npos);
717 
718  runtime->UnloadNetwork(netId);
719 
720  // Check output is as expected
721  // Validate the result against the expected values
722  auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
723  CHECK(outputResult);
724 
725  // Check the output is correct
726  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
727 }
728 
729 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
730 {
731  using namespace half_float::literal;
732 
733  // Create runtime in which test will run
734  IRuntime::CreationOptions options;
735  IRuntimePtr runtime(armnn::IRuntime::Create(options));
736 
737  // build up the structure of the network
738  NetworkImpl network;
739 
740  armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
741  armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);
742 
743  std::vector<Half> expectedOutput = { 1.0_h };
744 
745  unsigned int numElements = inputInfo.GetNumElements();
746  size_t totalBytesInput = numElements * sizeof(float);
747  size_t totalBytesOutput = numElements * sizeof(Half);
748 
749  IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
750  ARMNN_ASSERT(inputLayer);
751 
752  armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
753  ARMNN_ASSERT(convLayer);
754 
755  inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
756  inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
757 
758  IConnectableLayer* output = network.AddOutputLayer(0, "output");
759  convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
760  convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
761 
762  // Optimize the network
763  OptimizerOptions optOptions;
764  optOptions.m_ImportEnabled = false;
765  optOptions.m_ExportEnabled = false;
766  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
767  IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
768  CHECK(optNet);
769 
770  // Loads it into the runtime.
771  NetworkId netId;
772  std::string ignoredErrorMessage;
773  // Import at load time is disabled; import is forced later via ImportInputs/ImportOutputs
774  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
775  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
776 
777  // Creates structures for input & output
778  const size_t alignment =
779  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
780  size_t spaceInput = totalBytesInput + alignment + alignment;
781  size_t spaceOutput = totalBytesOutput + alignment + alignment;
782  auto inputData = std::make_unique<uint8_t[]>(spaceInput);
783  void* alignedInputPtr = inputData.get();
784  CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
785 
786  // Fill the single input element
787  auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
788  inputPtr[0] = 1.0f;
789 
790  auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
791  void* alignedOutputPtr = outputData.get();
792  CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
793  auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
794  std::fill_n(outputPtr, numElements, -10.0f);
795 
796  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
797  inputTensorInfo.SetConstant(true);
798  InputTensors inputTensors
799  {
800  {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
801  };
802  OutputTensors outputTensors
803  {
804  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
805  };
806 
807  runtime->GetProfiler(netId)->EnableProfiling(true);
808 
809  INFO("Run ImportInputs");
810  std::vector<ImportedInputId> importedInputIds =
811  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
812  CHECK(importedInputIds.size() == 1);
813  std::vector<ImportedOutputId> importedOutputIds =
814  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
815  CHECK(importedOutputIds.size() == 1);
816 
817  // Do the inference
818  runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
819 
820  // Retrieve the Profiler.Print() output to get the workload execution
821  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
822  std::stringstream ss;
823  profilerManager.GetProfiler()->Print(ss);
824  std::string dump = ss.str();
825 
826  // Contains ConvertFp32ToFp16Workload
827  std::size_t found = dump.find("ConvertFp32ToFp16Workload");
828  CHECK(found != std::string::npos);
829 
830  // Contains SyncMemGeneric
831  found = dump.find("SyncMemGeneric");
832  CHECK(found != std::string::npos);
833 
834  // Does not contain CopyMemGeneric
835  found = dump.find("CopyMemGeneric");
836  CHECK(found == std::string::npos);
837 
838  runtime->UnloadNetwork(netId);
839 
840  // Check output is as expected
841  // Validate the result against the expected values
842  auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
843  CHECK(outputResult);
844 
845  // Check the output is correct
846  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
847 }
848 
849 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
850 {
851 /*
852  * This test checks the Forced Import functionality when repeated inferences require switching from importing to
853  * copying. For the first inference we create aligned pointers and check that they are imported correctly. For the
854  * second we use similar pointers but do not use pre-importing.
855  */
856  // Create runtime in which test will run
857  IRuntime::CreationOptions options;
858  IRuntimePtr runtime(armnn::IRuntime::Create(options));
859 
860  // build up the structure of the network
861  INetworkPtr network(INetwork::Create());
862 
863  armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
864  armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
865  armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
866 
867  kernelInfo.SetConstant(true);
868 
869  std::vector<float> kernel =
870  {
871  4, 5, 6,
872  0, 0, 0,
873  3, 2, 1
874  };
875 
876  const std::vector<float> expectedOutput =
877  {
878  23, 41, 33, 21,
879  44, 65, 76, 52,
880  82, 85, 79, 42
881  };
882 
883  unsigned int numElements = inputInfo.GetNumElements();
884  size_t totalBytes = numElements * sizeof(float);
885 
886  IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
887  ARMNN_ASSERT(inputLayer);
888 
889  armnn::ConstTensor weights(kernelInfo, kernel);
890 
891  Convolution2dDescriptor convDesc2d;
892  convDesc2d.m_StrideX = 1;
893  convDesc2d.m_StrideY = 1;
894  convDesc2d.m_PadLeft = 1;
895  convDesc2d.m_PadRight = 1;
896  convDesc2d.m_PadTop = 1;
897  convDesc2d.m_PadBottom = 1;
898  convDesc2d.m_DataLayout = DataLayout::NHWC;
899  armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
900  ARMNN_ASSERT(convLayer);
901 
902  armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
903 
904  weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
905  weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
906 
907  inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
908  inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
909 
910  IConnectableLayer* output = network->AddOutputLayer(0, "output");
911  convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
912  convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
913 
914  // Optimize the network
915  OptimizerOptions optOptions;
916  optOptions.m_ImportEnabled = false;
917  optOptions.m_ExportEnabled = false;
918  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
919  IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
920  CHECK(optNet);
921 
922  // Loads it into the runtime.
923  NetworkId netId;
924  std::string ignoredErrorMessage;
925  // Import at load time is disabled; import is forced later via ImportInputs/ImportOutputs
926  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
927  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
928 
929  // Creates structures for input & output
930  const size_t alignment =
931  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
932  size_t space = totalBytes + alignment + alignment;
933  auto inputData = std::make_unique<uint8_t[]>(space);
934  void* alignedInputPtr = inputData.get();
935  CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
936 
937  // Fill input with values
938  auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
939  inputPtr[0] = 1;
940  inputPtr[1] = 5;
941  inputPtr[2] = 2;
942  inputPtr[3] = 3;
943  inputPtr[4] = 8;
944  inputPtr[5] = 7;
945  inputPtr[6] = 3;
946  inputPtr[7] = 6;
947  inputPtr[8] = 3;
948  inputPtr[9] = 3;
949  inputPtr[10] = 9;
950  inputPtr[11] = 1;
951 
952 
953  auto outputData = std::make_unique<uint8_t[]>(space);
954  void* alignedOutputPtr = outputData.get();
955  CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
956  auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
957  std::fill_n(outputPtr, numElements, -10.0f);
958 
959  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
960  inputTensorInfo.SetConstant(true);
961  InputTensors inputTensors
962  {
963  {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
964  };
965  OutputTensors outputTensors
966  {
967  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
968  };
969 
970  runtime->GetProfiler(netId)->EnableProfiling(true);
971 
972  INFO("Run ImportInputs");
973  std::vector<ImportedInputId> importedInputIds =
974  runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
975  // We expect the import to have succeeded.
976  CHECK(importedInputIds.size() == 1);
977  std::vector<ImportedOutputId> importedOutputIds =
978  runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
979  // We expect the import to have succeeded.
980  CHECK(importedOutputIds.size() == 1);
981 
982  // Do the inference
983  runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
984 
985  // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
986  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
987  std::stringstream ss;
988  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
989  std::string dump = ss.str();
990 
991  // Contains Convolution2dWorkload
992  std::size_t found = dump.find("Convolution2dWorkload");
993  CHECK(found != std::string::npos);
994 
995  // Contains SyncMemGeneric
996  found = dump.find("SyncMemGeneric");
997  CHECK(found != std::string::npos);
998 
999  // Does not contain CopyMemGeneric
1000  found = dump.find("CopyMemGeneric");
1001  CHECK(found == std::string::npos);
1002 
1003  // Sync the outputs so we can read the data
1004  arm_compute::CLScheduler::get().sync();
1005 
1006  // Check output is as expected
1007  auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
1008  CHECK(outputResult);
1009  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1010 
1011  // Repeat the inference, with new tensors and without using PreImporting to force it to fall back to copying
1012 
1013  // Creates structures for input & output
1014  auto inputDataCopy = std::make_unique<uint8_t[]>(space);
1015  void* copyInputPtr = inputDataCopy.get();
1016 
1017  // Fill input with values
1018  auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr);
1019  inputCopyPtr[0] = 1;
1020  inputCopyPtr[1] = 5;
1021  inputCopyPtr[2] = 2;
1022  inputCopyPtr[3] = 3;
1023  inputCopyPtr[4] = 8;
1024  inputCopyPtr[5] = 7;
1025  inputCopyPtr[6] = 3;
1026  inputCopyPtr[7] = 6;
1027  inputCopyPtr[8] = 3;
1028  inputCopyPtr[9] = 3;
1029  inputCopyPtr[10] = 9;
1030  inputCopyPtr[11] = 1;
1031 
1032  // Output pre-filled with -10.0f
1033  auto outputDataCopy = std::make_unique<uint8_t[]>(space);
1034  void* copyOutputPtr = outputDataCopy.get();
1035  auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr);
1036  std::fill_n(outputCopyPtr, numElements, -10.0f);
1037 
1038  InputTensors inputTensorsCopy
1039  {
1040  {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
1041  };
1042  OutputTensors outputTensorsCopy
1043  {
1044  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
1045  };
1046 
1047  // Do the inference without any pre-imported input/output ids
1048  runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy);
1049  // Sync the outputs so we can read the data
1050  arm_compute::CLScheduler::get().sync();
1051 
1052  // Check the output is correct
1053  outputResult = reinterpret_cast<float*>(copyOutputPtr);
1054  CHECK(outputResult);
1055  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1056 
1057  // Query the profiler again, this will contain the results of both inferences
1058  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1059  dump = ss.str();
1060 
1061  // Contains Convolution2dWorkload
1062  found = dump.find("Convolution2dWorkload");
1063  CHECK(found != std::string::npos);
1064 
1065  // Should still contain the SyncMemGeneric
1066  found = dump.find("SyncMemGeneric");
1067  CHECK(found != std::string::npos);
1068 
1069  // Should now also contain a CopyMemGeneric
1070  found = dump.find("CopyMemGeneric");
1071  CHECK(found != std::string::npos);
1072  runtime->UnloadNetwork(netId);
1073 }
1074 
1075 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest")
1076 {
1077 /*
1078  * This test is similar to the test above but instead of importing and then copying, we start by copying and then do
1079  * the import.
1080  */
1081  // Create runtime in which test will run
1082  IRuntime::CreationOptions options;
1083  IRuntimePtr runtime(armnn::IRuntime::Create(options));
1084 
1085  // build up the structure of the network
1086  INetworkPtr network(INetwork::Create());
1087 
1088  armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
1089  armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
1090  armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
1091 
1092  kernelInfo.SetConstant(true);
1093 
1094  std::vector<float> kernel =
1095  {
1096  4, 5, 6,
1097  0, 0, 0,
1098  3, 2, 1
1099  };
1100 
1101  const std::vector<float> expectedOutput =
1102  {
1103  23, 41, 33, 21,
1104  44, 65, 76, 52,
1105  82, 85, 79, 42
1106  };
1107 
1108  unsigned int numElements = inputInfo.GetNumElements();
1109  size_t totalBytes = numElements * sizeof(float);
1110 
1111  IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
1112  ARMNN_ASSERT(inputLayer);
1113 
1114  armnn::ConstTensor weights(kernelInfo, kernel);
1115 
1116  armnn::Convolution2dDescriptor convDesc2d;
1117  convDesc2d.m_StrideX = 1;
1118  convDesc2d.m_StrideY = 1;
1119  convDesc2d.m_PadLeft = 1;
1120  convDesc2d.m_PadRight = 1;
1121  convDesc2d.m_PadTop = 1;
1122  convDesc2d.m_PadBottom = 1;
1123  convDesc2d.m_DataLayout = DataLayout::NHWC;
1124 
1125  armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
1126  ARMNN_ASSERT(convLayer);
1127 
1128  armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
1129 
1130  weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
1131  weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
1132 
1133  inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
1134  inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
1135 
1136  IConnectableLayer* output = network->AddOutputLayer(0, "output");
1137  convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1138  convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
1139 
1140  // Optimize the network
1141  OptimizerOptions optOptions;
1142  optOptions.m_ImportEnabled = false;
1143  optOptions.m_ExportEnabled = false;
1144  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
1145  IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
1146  CHECK(optNet);
1147 
1148  // Loads it into the runtime.
1149  NetworkId netId;
1150  std::string ignoredErrorMessage;
1151  // Import at load time is disabled; import is forced later via ImportInputs/ImportOutputs
1152  INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1153  runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
1154 
1155  // Creates structures for input & output
1156  const size_t alignment =
1157  arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
1158  size_t space = totalBytes + alignment + alignment;
1159  auto inputData = std::make_unique<uint8_t[]>(space);
1160  void* copyInputPtr = inputData.get();
1161 
1162  // Fill input with values
1163  auto* inputPtr = reinterpret_cast<float*>(copyInputPtr);
1164  inputPtr[0] = 1;
1165  inputPtr[1] = 5;
1166  inputPtr[2] = 2;
1167  inputPtr[3] = 3;
1168  inputPtr[4] = 8;
1169  inputPtr[5] = 7;
1170  inputPtr[6] = 3;
1171  inputPtr[7] = 6;
1172  inputPtr[8] = 3;
1173  inputPtr[9] = 3;
1174  inputPtr[10] = 9;
1175  inputPtr[11] = 1;
1176 
1177  // Create output buffer and fill it with -10.0f
1178  auto outputData = std::make_unique<uint8_t[]>(space);
1179  void* copyOutputPtr = outputData.get();
1180  auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr);
1181  std::fill_n(outputPtr, numElements, -10.0f);
1182 
1183  TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
1184  inputTensorInfo.SetConstant(true);
1185  InputTensors inputTensors
1186  {
1187  {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
1188  };
1189  OutputTensors outputTensors
1190  {
1191  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
1192  };
1193 
1194  runtime->GetProfiler(netId)->EnableProfiling(true);
1195 
1196  // Do the inference without any pre-imported inputs/outputs
1197  runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
1198 
1199  // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1200  ProfilerManager& profilerManager = ProfilerManager::GetInstance();
1201  std::stringstream ss;
1202  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1203  std::string dump = ss.str();
1204 
1205  // Contains Convolution2dWorkload
1206  std::size_t found = dump.find("Convolution2dWorkload");
1207  CHECK(found != std::string::npos);
1208 
1209  // Does not contain SyncMemGeneric
1210  found = dump.find("SyncMemGeneric");
1211  CHECK(found == std::string::npos);
1212 
1213  // Does contain CopyMemGeneric
1214  found = dump.find("CopyMemGeneric");
1215  CHECK(found != std::string::npos);
1216 
1217  // Sync the outputs so we can read the data
1218  arm_compute::CLScheduler::get().sync();
1219 
1220  // Check output is as expected
1221  auto* outputResult = reinterpret_cast<float*>(copyOutputPtr);
1222  CHECK(outputResult);
1223  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1224 
1225  // Repeat the inference, with new tensors and while using pre-importing to force it to import
1226 
1227  // Creates structures for input & output
1228  auto inputDataImport = std::make_unique<uint8_t[]>(space);
1229  void* alignedInputImportPtr = inputDataImport.get();
1230  CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space));
1231 
1232  // Fill input with values
1233  auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr);
1234  inputImportPtr[0] = 1;
1235  inputImportPtr[1] = 5;
1236  inputImportPtr[2] = 2;
1237  inputImportPtr[3] = 3;
1238  inputImportPtr[4] = 8;
1239  inputImportPtr[5] = 7;
1240  inputImportPtr[6] = 3;
1241  inputImportPtr[7] = 6;
1242  inputImportPtr[8] = 3;
1243  inputImportPtr[9] = 3;
1244  inputImportPtr[10] = 9;
1245  inputImportPtr[11] = 1;
1246 
1247  // Output pre-filled with -10.0f
1248  auto outputDataImport = std::make_unique<uint8_t[]>(space);
1249  void* alignedOutputImportPtr = outputDataImport.get();
1250  CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space));
1251  auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr);
1252  std::fill_n(outputImportPtr, numElements, -10.0f);
1253 
1254  InputTensors inputTensorsImport
1255  {
1256  {0,armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)},
1257  };
1258  OutputTensors outputTensorsImport
1259  {
1260  {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)}
1261  };
1262 
1263  INFO("Run ImportInputs");
1264  std::vector<ImportedInputId> importedInputIds =
1265  runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc);
1266  CHECK(importedInputIds.size() == 1);
1267  std::vector<ImportedOutputId> importedOutputIds =
1268  runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc);
1269  CHECK(importedOutputIds.size() == 1);
1270 
1271  // Do the inference with pre-imported inputs/outputs
1272  runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
1273  // Sync the outputs so we can read the data
1274  arm_compute::CLScheduler::get().sync();
1275 
1276  // Check the output is correct
1277  outputResult = reinterpret_cast<float*>(alignedOutputImportPtr);
1278  CHECK(outputResult);
1279  CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1280 
1281 
1282  // Query the profiler again, this will contain the results of both inferences
1283  profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1284  dump = ss.str();
1285 
1286  // Contains Convolution2dWorkload
1287  found = dump.find("Convolution2dWorkload");
1288  CHECK(found != std::string::npos);
1289 
1290  // Should now contain the SyncMemGeneric
1291  found = dump.find("SyncMemGeneric");
1292  CHECK(found != std::string::npos);
1293 
1294  // Should still contain a CopyMemGeneric from the first inference
1295  found = dump.find("CopyMemGeneric");
1296  CHECK(found != std::string::npos);
1297  runtime->UnloadNetwork(netId);
1298 }
1299 
1300 }