diff options
author | Narumol Prangnawarat <narumol.prangnawarat@arm.com> | 2022-01-28 17:59:18 +0000 |
---|---|---|
committer | Jim Flynn <jim.flynn@arm.com> | 2022-01-31 12:53:51 +0000 |
commit | e2af6f4322a1e2b8b3c391fb721a6a80c281477f (patch) | |
tree | b0dd53289e27304a6d724821459cb0f4b6343a39 /src/backends | |
parent | fd313fef775ed210f8dab84452ea382a0b4164b0 (diff) | |
download | armnn-e2af6f4322a1e2b8b3c391fb721a6a80c281477f.tar.gz |
IVGCVSW-6552 Add support of aligned host memory
* Add AllocatedData functions to OutputHandler
* Enable import aligned memory in ImportInputs
* Enable import aligned memory in ImportOutputs
* Allow import of input and output if the memory is aligned
* Implement Reconfigure function on ClConvolution2dWorkload
* End-to-end test on Ref and Cl to ensure that input and output memory
are imported when aligned
Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: I9e5e4c26d1ac2f1d806803ade5f64c6479c51718
Diffstat (limited to 'src/backends')
-rw-r--r-- | src/backends/cl/ClBackend.cpp | 18 | ||||
-rw-r--r-- | src/backends/cl/ClImportTensorHandle.hpp | 12 | ||||
-rw-r--r-- | src/backends/cl/test/ClCreateWorkloadTests.cpp | 61 | ||||
-rw-r--r-- | src/backends/cl/test/ClImportTensorHandleTests.cpp | 153 | ||||
-rw-r--r-- | src/backends/cl/workloads/ClBaseWorkload.hpp | 25 | ||||
-rw-r--r-- | src/backends/cl/workloads/ClConvolution2dWorkload.cpp | 17 | ||||
-rw-r--r-- | src/backends/cl/workloads/ClConvolution2dWorkload.hpp | 8 | ||||
-rw-r--r-- | src/backends/neon/workloads/NeonBaseWorkload.hpp | 25 | ||||
-rw-r--r-- | src/backends/reference/test/RefEndToEndTests.cpp | 87 |
9 files changed, 398 insertions(+), 8 deletions(-)
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index 8abb16ccca..0fc5da78d1 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -133,6 +133,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) const { + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined)) + { + inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc); + } + if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined)) + { + outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc); + } std::shared_ptr<ClMemoryManager> memoryManager; if (m_UsingCustomAllocator) { @@ -193,6 +202,15 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis MemorySourceFlags inputFlags, MemorySourceFlags outputFlags) { + // To allow force import if inputFlags/outputFlags are Undefined, set it as Malloc + if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined)) + { + inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc); + } + if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined)) + { + outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc); + } std::shared_ptr<ClMemoryManager> memoryManager; if (m_UsingCustomAllocator) { diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp index a236a70d7c..54710d8135 100644 --- a/src/backends/cl/ClImportTensorHandle.hpp +++ b/src/backends/cl/ClImportTensorHandle.hpp @@ -205,7 +205,11 @@ public: // We do this to match the behaviour of the Import function later on. 
auto cachelineAlignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); - auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + auto roundedSize = totalBytes; + if (totalBytes % cachelineAlignment != 0) + { + roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + } cl_int error = CL_SUCCESS; cl_mem buffer; @@ -252,7 +256,11 @@ private: // This does not change the size of the buffer, only the size of the mapping the buffer is mapped to auto cachelineAlignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); - auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + auto roundedSize = totalBytes; + if (totalBytes % cachelineAlignment != 0) + { + roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment); + } cl_int error = CL_SUCCESS; cl_mem buffer; diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp index d8b2d4f786..4a28205ade 100644 --- a/src/backends/cl/test/ClCreateWorkloadTests.cpp +++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp @@ -11,11 +11,14 @@ #include <armnn/utility/PolymorphicDowncast.hpp> #include <armnn/backends/MemCopyWorkload.hpp> #include <armnnTestUtils/TensorCopyUtils.hpp> +#include <TensorHelpers.hpp> #include <armnnTestUtils/WorkloadTestUtils.hpp> #include <aclCommon/test/CreateWorkloadClNeon.hpp> #include <aclCommon/ArmComputeTensorUtils.hpp> +#include <cl/ClImportTensorHandle.hpp> +#include <cl/ClImportTensorHandleFactory.hpp> #include <cl/ClTensorHandle.hpp> #include <cl/ClWorkloadFactory.hpp> #include <cl/workloads/ClWorkloads.hpp> @@ -355,6 +358,64 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dFastMathEnabledWo ARMNN_ASSERT(conv2dWorkload->GetConvolutionMethod() == arm_compute::ConvolutionMethod::WINOGRAD); } 
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClReplaceInputOutputConvolution2dWorkload") +{ + // Create Convolution2dWorkload with ClTensorHandle input and output + // Then replace the input and output with ClImportTensorHandle + Graph graph; + ClWorkloadFactory factory = + ClWorkloadFactoryHelper::GetFactory(ClWorkloadFactoryHelper::GetMemoryManager()); + + auto workload = + CreateConvolution2dWorkloadTest<ClConvolution2dWorkload, DataType::Float32>(factory, + graph, + DataLayout::NHWC); + + TensorShape inputShape = std::initializer_list<unsigned int>({2, 8, 16, 3}); + TensorShape outputShape = std::initializer_list<unsigned int>({2, 2, 10, 2}); + + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). + Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]); + auto outputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]); + CHECK((inputHandle->GetShape() == inputShape)); + CHECK((outputHandle->GetShape() == outputShape)); + // The input and output handles are created correctly as ClTensorHandle + CHECK((dynamic_cast<ClTensorHandle*>(inputHandle) != nullptr)); + CHECK((dynamic_cast<ClTensorHandle*>(outputHandle) != nullptr)); + + // Replace with ImportTensorHandle + ClImportTensorHandleFactory importFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc), + static_cast<MemorySourceFlags>(MemorySource::Malloc)); + + TensorInfo inputInfo({ 2, 8, 16, 3 }, DataType::Float32); + TensorInfo outputInfo({ 2, 2, 10, 2 }, DataType::Float32); + + // create TensorHandle for memory import + auto inputImportHandle = importFactory.CreateTensorHandle(inputInfo); + auto outputImportHandle = importFactory.CreateTensorHandle(outputInfo); + + // Calling ReplaceInputTensorHandle and ReplaceOutputTensorHandle does not throw exception + // as Reconfigure function is implemented + 
workload->ReplaceInputTensorHandle(inputImportHandle.get(), 0); + workload->ReplaceOutputTensorHandle(outputImportHandle.get(), 0); + + // Correctly replaced with the import handles with correct information + queueDescriptor = workload->GetData(); + auto replacedInputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]); + auto replacedOutputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]); + CHECK((replacedInputHandle->GetShape() == inputShape)); + CHECK((replacedOutputHandle->GetShape() == outputShape)); + + CHECK((inputImportHandle.get() == replacedInputHandle)); + CHECK((inputImportHandle.get() == replacedInputHandle)); + + CHECK((dynamic_cast<ClTensorHandle*>(replacedInputHandle) == nullptr)); + CHECK((dynamic_cast<ClImportTensorHandle*>(replacedInputHandle) != nullptr)); + CHECK((dynamic_cast<ClTensorHandle*>(replacedOutputHandle) == nullptr)); + CHECK((dynamic_cast<ClImportTensorHandle*>(replacedOutputHandle) != nullptr)); +} + TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dClCompiledContextWorkload") { using namespace armnn; diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp index 3d702642aa..161765484d 100644 --- a/src/backends/cl/test/ClImportTensorHandleTests.cpp +++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp @@ -274,4 +274,157 @@ TEST_CASE("ClCanBeImportedAlignedMemory") // we can be confident that it will be successfully imported. All other cases will need to be handled by the user. 
} +TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd") +{ + // Create runtime in which test will run + IRuntime::CreationOptions options; + IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr network(INetwork::Create()); + + armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32); + armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32); + armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32); + + kernelInfo.SetConstant(true); + + std::vector<float> kernel = + { + 4, 5, 6, + 0, 0, 0, + 3, 2, 1 + }; + + const std::vector<float> expectedOutput = + { + 23, 41, 33, 21, + 44, 65, 76, 52, + 82, 85, 79, 42 + }; + + unsigned int numElements = inputInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input"); + ARMNN_ASSERT(inputLayer); + + armnn::ConstTensor weights(kernelInfo, kernel); + + armnn::Convolution2dDescriptor convDesc2d; + convDesc2d.m_StrideX = 1; + convDesc2d.m_StrideY = 1; + convDesc2d.m_PadLeft = 1; + convDesc2d.m_PadRight = 1; + convDesc2d.m_PadTop = 1; + convDesc2d.m_PadBottom = 1; + convDesc2d.m_DataLayout = DataLayout::NHWC; + armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, + weights, + armnn::EmptyOptional(), + "conv"); + ARMNN_ASSERT(convLayer); + + inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); + inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); + + IConnectableLayer* output = network->AddOutputLayer(0, "output"); + convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); + + // Optimize the network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = false; + std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), 
optOptions); + CHECK(optNet); + + // Loads it into the runtime. + NetworkId netId; + std::string ignoredErrorMessage; + // Enable Importing + INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); + runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Creates structures for input & output + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); + size_t space = totalBytes + alignment + alignment; + auto inputData = std::make_unique<uint8_t[]>(space); + void* alignedInputPtr = inputData.get(); + CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); + + // Input with negative values + auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr); + inputPtr[0] = 1; + inputPtr[1] = 5; + inputPtr[2] = 2; + inputPtr[3] = 3; + inputPtr[4] = 8; + inputPtr[5] = 7; + inputPtr[6] = 3; + inputPtr[7] = 6; + inputPtr[8] = 3; + inputPtr[9] = 3; + inputPtr[10] = 9; + inputPtr[11] = 1; + + + auto outputData = std::make_unique<uint8_t[]>(space); + void* alignedOutputPtr = outputData.get(); + CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space)); + auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr); + std::fill_n(outputPtr, numElements, -10.0f); + + TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); + inputTensorInfo.SetConstant(true); + InputTensors inputTensors + { + {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, + }; + OutputTensors outputTensors + { + {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} + }; + + runtime->GetProfiler(netId)->EnableProfiling(true); + + INFO("Run ImportInputs"); + std::vector<ImportedInputId> importedInputIds = + runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); + std::vector<ImportedOutputId> importedOutputIds = + runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); + + // Do the 
inference + runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds); + + // Retrieve the Profiler.Print() output to get the workload execution + ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + std::stringstream ss; + profilerManager.GetProfiler()->Print(ss);; + std::string dump = ss.str(); + + // Contains Convolution2dWorkload + std::size_t found = dump.find("Convolution2dWorkload"); + CHECK(found != std::string::npos); + + // Contains SyncMemGeneric + found = dump.find("SyncMemGeneric"); + CHECK(found != std::string::npos); + + // Does not contain CopyMemGeneric + found = dump.find("CopyMemGeneric"); + CHECK(found == std::string::npos); + + runtime->UnloadNetwork(netId); + + // Check output is as expected + // Validate result by checking that the output has no negative values + auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr); + CHECK(outputResult); + + // Check the output is correct + CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); +} + } diff --git a/src/backends/cl/workloads/ClBaseWorkload.hpp b/src/backends/cl/workloads/ClBaseWorkload.hpp index e74fc84f4f..03417e33ae 100644 --- a/src/backends/cl/workloads/ClBaseWorkload.hpp +++ b/src/backends/cl/workloads/ClBaseWorkload.hpp @@ -20,17 +20,38 @@ public: // Replace input tensor handle with the given TensorHandle and call Reconfigure() void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override { + ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot]; this->m_Data.m_Inputs[slot] = tensorHandle; - Reconfigure(); + try + { + Reconfigure(); + } + catch(armnn::UnimplementedException& e) + { + // Cannot reconfigure, revert the slot back and throw the exception. 
+ this->m_Data.m_Inputs[slot] = backupHandle; + throw e; + } } // Replace output tensor handle with the given TensorHandle and call Reconfigure() void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override { + ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot]; this->m_Data.m_Outputs[slot] = tensorHandle; - Reconfigure(); + try + { + Reconfigure(); + } + catch(armnn::UnimplementedException& e) + { + // Cannot reconfigure, revert the slot back and throw the exception. + this->m_Data.m_Inputs[slot] = backupHandle; + throw e; + } } +protected: // Reconfigure the workload configuration. Throw armnn::UnimplementedException by default. virtual void Reconfigure() { diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp index 705e92d307..cdfa885f67 100644 --- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp +++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp @@ -90,6 +90,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); + // Create Proxy tensor and set the initial tensor handle to it + m_InputProxy = std::make_unique<ICLTensorProxy>(&input); + m_OutputProxy = std::make_unique<ICLTensorProxy>(&output); + arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout); input.info()->set_data_layout(aclDataLayout); output.info()->set_data_layout(aclDataLayout); @@ -101,10 +105,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip { ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_configure"); m_ConvolutionLayer.configure(clCompileContext, - &input, + m_InputProxy.get(), m_KernelTensor.get(), m_BiasTensor.get(), - &output, + m_OutputProxy.get(), 
padStrideInfo, arm_compute::WeightsInfo(), aclDilationInfo, @@ -174,4 +178,13 @@ void ClConvolution2dWorkload::FreeUnusedTensors() FreeTensorIfUnused(m_BiasTensor); } +void ClConvolution2dWorkload::Reconfigure() +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_Reconfigure"); + arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); + m_InputProxy->set(&input); + m_OutputProxy->set(&output); +} + } //namespace armnn diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp index 8a4599df47..891d5096cd 100644 --- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp +++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp @@ -13,6 +13,8 @@ #include <arm_compute/runtime/CL/functions/CLConvolutionLayer.h> #include <arm_compute/runtime/MemoryManagerOnDemand.h> +#include <cl/ICLTensorProxy.hpp> + #include <memory> namespace armnn @@ -38,6 +40,9 @@ public: arm_compute::ConvolutionMethod GetConvolutionMethod() const; +protected: + void Reconfigure() override; + private: mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; @@ -47,6 +52,9 @@ private: arm_compute::ConvolutionMethod m_ConvolutionMethod; void FreeUnusedTensors(); + + std::unique_ptr<ICLTensorProxy> m_InputProxy; + std::unique_ptr<ICLTensorProxy> m_OutputProxy; }; } //namespace armnn diff --git a/src/backends/neon/workloads/NeonBaseWorkload.hpp b/src/backends/neon/workloads/NeonBaseWorkload.hpp index a92f35a173..63f3539164 100644 --- a/src/backends/neon/workloads/NeonBaseWorkload.hpp +++ b/src/backends/neon/workloads/NeonBaseWorkload.hpp @@ -20,17 +20,38 @@ public: // Replace input tensor handle with the given TensorHandle and call Reconfigure() void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override { + ITensorHandle* backupHandle = 
this->m_Data.m_Inputs[slot]; this->m_Data.m_Inputs[slot] = tensorHandle; - Reconfigure(); + try + { + Reconfigure(); + } + catch(armnn::UnimplementedException& e) + { + // Cannot reconfigure, revert the slot back and throw the exception. + this->m_Data.m_Inputs[slot] = backupHandle; + throw e; + } } // Replace output tensor handle with the given TensorHandle and call Reconfigure() void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override { + ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot]; this->m_Data.m_Outputs[slot] = tensorHandle; - Reconfigure(); + try + { + Reconfigure(); + } + catch(armnn::UnimplementedException& e) + { + // Cannot reconfigure, revert the slot back and throw the exception. + this->m_Data.m_Inputs[slot] = backupHandle; + throw e; + } } +protected: // Reconfigure the workload configuration. Throw armnn::UnimplementedException by default. virtual void Reconfigure() { diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp index 4444f5c361..7a6cf97936 100644 --- a/src/backends/reference/test/RefEndToEndTests.cpp +++ b/src/backends/reference/test/RefEndToEndTests.cpp @@ -1378,6 +1378,93 @@ TEST_CASE("RefRankEndToEndTestQSymmS8") RankEndToEnd<armnn::DataType::QSymmS8>(defaultBackends); } +TEST_CASE("RefForceImportTest") +{ + using namespace armnn; + + std::vector<BackendId> backends = defaultBackends; + + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Builds up the structure of the network. 
+ INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + ActivationDescriptor descriptor; + descriptor.m_Function = ActivationFunction::Square; + IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0)); + activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true)); + activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32)); + + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should pass. + NetworkId netId; + std::string ignoredErrorMessage; + + INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); + + CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties) + == Status::Success); + + // Creates structures for input & output + std::vector<float> inputData + { + 1.0f, 2.0f, 3.0f, 4.0f + }; + + std::vector<float> outputData(4); + + std::vector<float> expectedOutput + { + 1.0f, 4.0f, 9.0f, 16.0f + }; + + InputTensors inputTensors + { + {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())}, + }; + OutputTensors outputTensors + { + {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} + }; + + runtime->GetProfiler(netId)->EnableProfiling(true); + + std::vector<ImportedInputId> importedInputIds = + runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); + std::vector<ImportedOutputId> importedOutputIds = + runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); + + // Do the inference and force the import as the memory is alligned. 
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds); + + // Retrieve the Profiler.Print() output to get the workload execution + ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + std::stringstream ss; + profilerManager.GetProfiler()->Print(ss);; + std::string dump = ss.str(); + + // Check there is a SyncMemGeneric workload as we exported + int count = SubStringCounter(dump, "SyncMemGeneric"); + CHECK(count == 1); + + // Shouldn't be any CopyMemGeneric workloads + count = SubStringCounter(dump, "CopyMemGeneric"); + CHECK(count == 0); + + // Check the output is correct + CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end())); +} + #if !defined(__ANDROID__) // Only run these tests on non Android platforms TEST_CASE("RefImportNonAlignedPointerTest") |