From e38c418ebc434d6c2a5618388b0bd05963308047 Mon Sep 17 00:00:00 2001 From: Jan Eilers Date: Thu, 2 Sep 2021 13:12:11 +0100 Subject: Add sample app for asynchronous execution Signed-off-by: Jan Eilers Change-Id: I6d903c721d71a28bc02e4e98aaa813fb9159b678 --- samples/AsyncExecutionSample.cpp | 145 +++++++++++++++++++++++++++++++++++++++ samples/CMakeLists.txt | 3 + samples/examples.dox | 8 +++ 3 files changed, 156 insertions(+) create mode 100644 samples/AsyncExecutionSample.cpp (limited to 'samples') diff --git a/samples/AsyncExecutionSample.cpp b/samples/AsyncExecutionSample.cpp new file mode 100644 index 0000000000..6d2fe243dd --- /dev/null +++ b/samples/AsyncExecutionSample.cpp @@ -0,0 +1,145 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#include <armnn/INetwork.hpp> +#include <armnn/IRuntime.hpp> +#include <armnn/Utils.hpp> +#include <armnn/Descriptors.hpp> + +#include <iostream> +#include <thread> + +/// A simple example of using the ArmNN SDK API to run a network multiple times with different inputs in an asynchronous +/// manner. +/// +/// Background info: The usual runtime->EnqueueWorkload, which is used to trigger the execution of a network, is not +/// thread safe. Each workload has memory assigned to it which would be overwritten by each thread. +/// Before we added support for this you had to load a network multiple times to execute it at the +/// same time. Every time a network is loaded, it takes up memory on your device. Making the +/// execution thread safe helps to reduce the memory footprint for concurrent executions significantly. +/// This example shows you how to execute a model concurrently (multiple threads) while still only +/// loading it once. +/// +/// As in most of our simple samples, the network in this example will ask the user for a single input number for each +/// execution of the network. +/// The network consists of a single fully connected layer with a single neuron. The neuron's weight is set to 1.0f +/// to produce an output number that is the same as the input. 
+int main() +{ + using namespace armnn; + + // The first part of this code is very similar to the SimpleSample.cpp you should check it out for comparison + // The interesting part starts when the graph is loaded into the runtime + + std::vector inputs; + float number1; + std::cout << "Please enter a number for the first iteration: " << std::endl; + std::cin >> number1; + float number2; + std::cout << "Please enter a number for the second iteration: " << std::endl; + std::cin >> number2; + + // Turn on logging to standard output + // This is useful in this sample so that users can learn more about what is going on + ConfigureLogging(true, false, LogSeverity::Warning); + + // Construct ArmNN network + NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + weightsInfo.SetConstant(); + ConstTensor weights(weightsInfo, weightsData); + + // Constant layer that now holds weights data for FullyConnected + IConnectableLayer* const constantWeightsLayer = myNetwork->AddConstantLayer(weights, "const weights"); + + FullyConnectedDescriptor fullyConnectedDesc; + IConnectableLayer* const fullyConnectedLayer = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, + "fully connected"); + IConnectableLayer* InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0); + + InputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0)); + constantWeightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1)); + fullyConnectedLayer->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + IRuntimePtr run = IRuntime::Create(options); + + //Set the tensors in the network. 
+ TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + constantWeightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo); + + // Optimise ArmNN network + IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {Compute::CpuRef}, run->GetDeviceSpec()); + if (!optNet) + { + // This shouldn't happen for this simple sample, with reference backend. + // But in general usage Optimize could fail if the hardware at runtime cannot + // support the model that has been provided. + std::cerr << "Error: Failed to optimise the input network." << std::endl; + return 1; + } + + // Load graph into runtime. + std::string errmsg; // To hold an eventual error message if loading the network fails + // Add network properties to enable async execution. The MemorySource::Undefined variables indicate + // that neither inputs nor outputs will be imported. Importing will be covered in another example. + armnn::INetworkProperties networkProperties(true, MemorySource::Undefined, MemorySource::Undefined); + run->LoadNetwork(networkIdentifier, + std::move(optNet), + errmsg, + networkProperties); + + // Creates structures for inputs and outputs. A vector of float for each execution. 
+ std::vector> inputData{{number1}, {number2}}; + std::vector> outputData; + outputData.resize(2, std::vector(1)); + + + std::vector inputTensors + { + {{0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputData[0].data())}}, + {{0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputData[1].data())}} + }; + std::vector outputTensors + { + {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[0].data())}}, + {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[1].data())}} + }; + + // Lambda function to execute the network. We use it as thread function. + auto execute = [&](unsigned int executionIndex) + { + auto memHandle = run->CreateWorkingMemHandle(networkIdentifier); + run->Execute(*memHandle, inputTensors[executionIndex], outputTensors[executionIndex]); + }; + + // Prepare some threads and let each execute the network with a different input + std::vector threads; + for (unsigned int i = 0; i < inputTensors.size(); ++i) + { + threads.emplace_back(std::thread(execute, i)); + } + + // Wait for the threads to finish + for (std::thread& t : threads) + { + if(t.joinable()) + { + t.join(); + } + } + + std::cout << "Your numbers were " << outputData[0][0] << " and " << outputData[1][0] << std::endl; + return 0; + +} diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 7be6a69369..7af8b7265a 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,6 +1,9 @@ if(BUILD_SAMPLE_APP AND ARMNNREF) add_executable(SimpleSample SimpleSample.cpp) target_link_libraries(SimpleSample armnn ${CMAKE_THREAD_LIBS_INIT}) + + add_executable(AsyncExecutionSample AsyncExecutionSample.cpp) + target_link_libraries(AsyncExecutionSample armnn ${CMAKE_THREAD_LIBS_INIT}) endif() if(BUILD_SAMPLE_APP AND SAMPLE_DYNAMIC_BACKEND) diff --git a/samples/examples.dox b/samples/examples.dox index e0b0ea345e..4a41e30a48 100644 --- a/samples/examples.dox +++ b/samples/examples.dox @@ -38,4 
+38,12 @@ memory for the inputs, outputs and inter layer memory. @example CustomMemoryAllocatorSample.cpp **/ +/** +Yet another variant of the SimpleSample application. In this little sample app you will be shown how to run a +network multiple times asynchronously. + +@note This is currently an experimental interface +@example AsyncExecutionSample.cpp +**/ + } -- cgit v1.2.1