plain/23.11/_async_execution_sample_8cpp-example.html

//

// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.

// SPDX-License-Identifier: MIT

//

#include <armnn/INetwork.hpp>

#include <armnn/IRuntime.hpp>

#include <armnn/Utils.hpp>

#include <armnn/Descriptors.hpp>


#include <iostream>

#include <thread>


/// A simple example of using the ArmNN SDK API to run a network multiple times with different inputs in an asynchronous

/// manner.

///

/// Background info: The usual runtime->EnqueueWorkload, which is used to trigger the execution of a network, is not

///                  thread safe. Each workload has memory assigned to it which would be overwritten by each thread.

///                  Before we added support for this you had to load a network multiple times to execute it at the

///                  same time. Every time a network is loaded, it takes up memory on your device. Making the

///                  execution thread safe helps to reduce the memory footprint for concurrent executions significantly.

///                  This example shows you how to execute a model concurrently (multiple threads) while still only

///                  loading it once.

///

/// As in most of our simple samples, the network in this example will ask the user for a single input number for each

/// execution of the network.

/// The network consists of a single fully connected layer with a single neuron. The neurons weight is set to 1.0f

/// to produce an output number that is the same as the input.

int main()

{

    using namespace armnn;


    // The first part of this code is very similar to the SimpleSample.cpp you should check it out for comparison

    // The interesting part starts when the graph is loaded into the runtime


    std::vector<float> inputs;

    float number1;

    std::cout << "Please enter a number for the first iteration: " << std::endl;

    std::cin >> number1;

    float number2;

    std::cout << "Please enter a number for the second iteration: " << std::endl;

    std::cin >> number2;


    // Turn on logging to standard output

    // This is useful in this sample so that users can learn more about what is going on

    ConfigureLogging(true, false, LogSeverity::Warning);


    // Construct ArmNN network

    NetworkId networkIdentifier;

    INetworkPtr myNetwork = INetwork::Create();


    float weightsData[] = {1.0f}; // Identity

    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32, 0.0f, 0, true);

    weightsInfo.SetConstant();

    ConstTensor weights(weightsInfo, weightsData);


    // Constant layer that now holds weights data for FullyConnected

    IConnectableLayer* const constantWeightsLayer = myNetwork->AddConstantLayer(weights, "const weights");


    FullyConnectedDescriptor fullyConnectedDesc;

    IConnectableLayer* const fullyConnectedLayer = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc,

                                                                                     "fully connected");

    IConnectableLayer* InputLayer  = myNetwork->AddInputLayer(0);

    IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0);


    InputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0));

    constantWeightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1));

    fullyConnectedLayer->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));


    // Create ArmNN runtime

    IRuntime::CreationOptions options; // default options

    IRuntimePtr run = IRuntime::Create(options);


    //Set the tensors in the network.

    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);

    InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);


    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);

    fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    constantWeightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);


    // Optimise ArmNN network

    IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {Compute::CpuRef}, run->GetDeviceSpec());

    if (!optNet)

    {

        // This shouldn't happen for this simple sample, with reference backend.

        // But in general usage Optimize could fail if the hardware at runtime cannot

        // support the model that has been provided.

        std::cerr << "Error: Failed to optimise the input network." << std::endl;

        return 1;

    }


    // Load graph into runtime.

    std::string errmsg; // To hold an eventual error message if loading the network fails

    // Add network properties to enable async execution. The MemorySource::Undefined variables indicate

    // that neither inputs nor outputs will be imported. Importing will be covered in another example.

    armnn::INetworkProperties networkProperties(true, MemorySource::Undefined, MemorySource::Undefined);

    run->LoadNetwork(networkIdentifier,

                     std::move(optNet),

                     errmsg,

                     networkProperties);


    // Creates structures for inputs and outputs. A vector of float for each execution.

    std::vector<std::vector<float>> inputData{{number1}, {number2}};

    std::vector<std::vector<float>> outputData;

    outputData.resize(2, std::vector<float>(1));


    inputTensorInfo = run->GetInputTensorInfo(networkIdentifier, 0);

    inputTensorInfo.SetConstant(true);

    std::vector<InputTensors> inputTensors

    {

        {{0, armnn::ConstTensor(inputTensorInfo, inputData[0].data())}},

        {{0, armnn::ConstTensor(inputTensorInfo, inputData[1].data())}}

    };

    std::vector<OutputTensors> outputTensors

    {

        {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[0].data())}},

        {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[1].data())}}

    };


    // Lambda function to execute the network. We use it as thread function.

    auto execute = [&](unsigned int executionIndex)

    {

        auto memHandle = run->CreateWorkingMemHandle(networkIdentifier);

        run->Execute(*memHandle, inputTensors[executionIndex], outputTensors[executionIndex]);

    };


    // Prepare some threads and let each execute the network with a different input

    std::vector<std::thread> threads;

    for (unsigned int i = 0; i < inputTensors.size(); ++i)

    {

        threads.emplace_back(std::thread(execute, i));

    }


    // Wait for the threads to finish

    for (std::thread& t : threads)

    {

        if(t.joinable())

        {

            t.join();

        }

    }


    std::cout << "Your numbers were " << outputData[0][0] << " and " << outputData[1][0] << std::endl;

    return 0;


}