//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include <armnn/Utils.hpp>
#include <armnn/Descriptors.hpp>

#include <iostream>
#include <thread>

/// A simple example of using the ArmNN SDK API to run a network multiple times with different inputs in an
/// asynchronous manner.
///
/// Background info: The usual runtime->EnqueueWorkload, which is used to trigger the execution of a network, is not
///                  thread safe. Each workload has memory assigned to it which would be overwritten by each thread.
///                  Before we added support for this you had to load a network multiple times to execute it at the
///                  same time. Every time a network is loaded, it takes up memory on your device. Making the
///                  execution thread safe helps to reduce the memory footprint for concurrent executions
///                  significantly. This example shows you how to execute a model concurrently (multiple threads)
///                  while still only loading it once.
///
/// As in most of our simple samples, the network in this example will ask the user for a single input number for
/// each execution of the network.
/// The network consists of a single fully connected layer with a single neuron. The neuron's weight is set to 1.0f
/// so that the output number is the same as the input.
int main()
{
    using namespace armnn;

    // The first part of this code is very similar to SimpleSample.cpp; check that sample out for comparison.
    // The interesting part starts when the graph is loaded into the runtime.

    float number1;
    std::cout << "Please enter a number for the first iteration: " << std::endl;
    std::cin >> number1;

    float number2;
    std::cout << "Please enter a number for the second iteration: " << std::endl;
    std::cin >> number2;

    // Turn on logging to standard output.
    // This is useful in this sample so that users can learn more about what is going on.
    ConfigureLogging(true, false, LogSeverity::Warning);

    // Construct the ArmNN network.
    NetworkId networkIdentifier;
    INetworkPtr myNetwork = INetwork::Create();

    float weightsData[] = {1.0f}; // Identity
    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32, 0.0f, 0, true);
    weightsInfo.SetConstant();
    ConstTensor weights(weightsInfo, weightsData);

    // Constant layer that now holds the weights data for the FullyConnected layer.
    IConnectableLayer* const constantWeightsLayer = myNetwork->AddConstantLayer(weights, "const weights");

    FullyConnectedDescriptor fullyConnectedDesc;
    IConnectableLayer* const fullyConnectedLayer = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc,
                                                                                     "fully connected");
    IConnectableLayer* InputLayer  = myNetwork->AddInputLayer(0);
    IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0);

    InputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0));
    constantWeightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1));
    fullyConnectedLayer->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));

    // Create the ArmNN runtime.
    IRuntime::CreationOptions options; // default options
    IRuntimePtr run = IRuntime::Create(options);
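    // At this point the graph has been fully connected. As a sketch, the topology
    // built above looks like this (the constant layer feeds the weights into the
    // second input slot of the fully connected layer):
    //
    //   InputLayer (1x1) ────────────────┐
    //                                    ├──> FullyConnected ──> OutputLayer
    //   ConstantLayer (weights, 1x1) ────┘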
    // Set the tensors in the network.
    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);

    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    constantWeightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);

    // Optimise the ArmNN network.
    IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {Compute::CpuRef}, run->GetDeviceSpec());
    if (!optNet)
    {
        // This shouldn't happen for this simple sample, with the reference backend.
        // But in general usage Optimize could fail if the hardware at runtime cannot
        // support the model that has been provided.
        std::cerr << "Error: Failed to optimise the input network." << std::endl;
        return 1;
    }

    // Load the graph into the runtime.
    std::string errmsg; // To hold an eventual error message if loading the network fails
    // Add network properties to enable asynchronous execution. The MemorySource::Undefined variables indicate
    // that neither inputs nor outputs will be imported. Importing will be covered in another example.
    armnn::INetworkProperties networkProperties(true, MemorySource::Undefined, MemorySource::Undefined);
    run->LoadNetwork(networkIdentifier, std::move(optNet), errmsg, networkProperties);

    // Create structures for inputs and outputs: a vector of floats for each execution.
    std::vector<std::vector<float>> inputData{{number1}, {number2}};
    std::vector<std::vector<float>> outputData;
    outputData.resize(2, std::vector<float>(1));

    inputTensorInfo = run->GetInputTensorInfo(networkIdentifier, 0);
    inputTensorInfo.SetConstant(true);
    std::vector<InputTensors> inputTensors
    {
        {{0, armnn::ConstTensor(inputTensorInfo, inputData[0].data())}},
        {{0, armnn::ConstTensor(inputTensorInfo, inputData[1].data())}}
    };
    std::vector<OutputTensors> outputTensors
    {
        {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[0].data())}},
        {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[1].data())}}
    };

    // Lambda function to execute the network. We use it as the thread function.
    // Each thread creates its own working memory handle, which is what makes concurrent
    // calls to Execute safe: the intermediate tensors live in the handle rather than
    // in the loaded network itself.
    auto execute = [&](unsigned int executionIndex)
    {
        ARMNN_NO_DEPRECATE_WARN_BEGIN
        auto memHandle = run->CreateWorkingMemHandle(networkIdentifier);
        run->Execute(*memHandle, inputTensors[executionIndex], outputTensors[executionIndex]);
        ARMNN_NO_DEPRECATE_WARN_END
    };

    // Prepare some threads and let each execute the network with a different input.
    std::vector<std::thread> threads;
    for (unsigned int i = 0; i < inputTensors.size(); ++i)
    {
        threads.emplace_back(std::thread(execute, i));
    }

    // Wait for the threads to finish.
    for (std::thread& t : threads)
    {
        if (t.joinable())
        {
            t.join();
        }
    }

    std::cout << "Your numbers were " << outputData[0][0] << " and " << outputData[1][0] << std::endl;
    return 0;
}
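
// For reference: because the single weight is 1.0f, the network is an identity
// function and each execution simply echoes its input. Assuming the sample is
// built and run from a terminal, a session might look like this (user input
// shown inline):
//
//   Please enter a number for the first iteration:
//   4
//   Please enter a number for the second iteration:
//   7
//   Your numbers were 4 and 7
//
// The two Execute calls may finish in either order, but since inputs and outputs
// are indexed per execution, the printed results always line up with the numbers
// entered.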