//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include <armnn/Utils.hpp>
#include <armnn/Descriptors.hpp>

#include <iostream>
#include <thread>

/// A simple example of using the ArmNN SDK API to run a network multiple times with different inputs in an asynchronous
/// manner.
///
/// Background info: The usual runtime->EnqueueWorkload, which is used to trigger the execution of a network, is not
///                  thread safe. Each workload has memory assigned to it that would be overwritten by concurrent
///                  threads. Before async execution was supported, you had to load a network once per thread to
///                  execute it concurrently, and every loaded copy of the network takes up memory on your device.
///                  Making the execution thread safe significantly reduces the memory footprint of concurrent
///                  executions. This example shows you how to execute a model concurrently (from multiple threads)
///                  while loading it only once.
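///
/// For contrast, a minimal sketch of the pre-async approach (illustrative only; the ids and tensor
/// variables below are hypothetical placeholders):
///
///     NetworkId id1, id2;
///     runtime->LoadNetwork(id1, std::move(optNet1)); // one loaded copy per thread,
///     runtime->LoadNetwork(id2, std::move(optNet2)); // each occupying its own memory
///     // thread 1: runtime->EnqueueWorkload(id1, inputTensors1, outputTensors1);
///     // thread 2: runtime->EnqueueWorkload(id2, inputTensors2, outputTensors2);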
///
/// As in most of our simple samples, this example asks the user for a single input number for each
/// execution of the network.
/// The network consists of a single fully connected layer with a single neuron. The neuron's weight is set
/// to 1.0f so that the output number equals the input.
int main()
{
    using namespace armnn;

    // The first part of this code is very similar to SimpleSample.cpp; check it out for comparison.
    // The interesting part starts once the graph is loaded into the runtime.

    float number1;
    std::cout << "Please enter a number for the first iteration: " << std::endl;
    std::cin >> number1;
    float number2;
    std::cout << "Please enter a number for the second iteration: " << std::endl;
    std::cin >> number2;

    // Turn on logging to standard output
    // This is useful in this sample so that users can learn more about what is going on
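    // The arguments are ConfigureLogging(printToStandardOutput, printToDebugOutput, severity)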
    ConfigureLogging(true, false, LogSeverity::Warning);

    // Construct ArmNN network
    NetworkId networkIdentifier;
    INetworkPtr myNetwork = INetwork::Create();

    float weightsData[] = {1.0f}; // Identity
    // The trailing 'true' marks the tensor as constant, as required for weight tensors
    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32, 0.0f, 0, true);
    ConstTensor weights(weightsInfo, weightsData);

    // Constant layer that holds the weights data for the FullyConnected layer
    IConnectableLayer* const constantWeightsLayer = myNetwork->AddConstantLayer(weights, "const weights");

    FullyConnectedDescriptor fullyConnectedDesc;
    IConnectableLayer* const fullyConnectedLayer = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc,
                                                                                     "fully connected");
    IConnectableLayer* inputLayer  = myNetwork->AddInputLayer(0);
    IConnectableLayer* outputLayer = myNetwork->AddOutputLayer(0);

    inputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0));
    constantWeightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1));
    fullyConnectedLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));

    // Create ArmNN runtime
    IRuntime::CreationOptions options; // default options
    IRuntimePtr run = IRuntime::Create(options);

    // Set the tensor infos in the network.
    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);

    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
    constantWeightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);

    // Optimise ArmNN network
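    // Compute::CpuRef selects the portable reference backend; the list is in order of preference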
    IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {Compute::CpuRef}, run->GetDeviceSpec());
    if (!optNet)
    {
        // This shouldn't happen for this simple sample, with the reference backend.
        // But in general usage Optimize could fail if the hardware at runtime cannot
        // support the model that has been provided.
        std::cerr << "Error: Failed to optimise the input network." << std::endl;
        return 1;
    }

    // Load graph into runtime.
    std::string errmsg; // To hold an error message if loading the network fails
    // Network properties enable asynchronous execution: the first argument (true) requests it, and the two
    // MemorySource::Undefined values indicate that neither inputs nor outputs will be imported.
    // Importing will be covered in another example.
    armnn::INetworkProperties networkProperties(true, MemorySource::Undefined, MemorySource::Undefined);
    if (run->LoadNetwork(networkIdentifier,
                         std::move(optNet),
                         errmsg,
                         networkProperties) != Status::Success)
    {
        std::cerr << "Error: Failed to load the network into the runtime: " << errmsg << std::endl;
        return 1;
    }

    // Create structures for the inputs and outputs: one vector of floats per execution.
    std::vector<std::vector<float>> inputData{{number1}, {number2}};
    std::vector<std::vector<float>> outputData;
    outputData.resize(2, std::vector<float>(1));

    inputTensorInfo = run->GetInputTensorInfo(networkIdentifier, 0);
    inputTensorInfo.SetConstant(true);
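    // InputTensors/OutputTensors are vectors of {bindingId, tensor} pairs; we build one such vector per
    // execution so that each thread works on its own buffers.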
    std::vector<InputTensors> inputTensors
    {
        {{0, armnn::ConstTensor(inputTensorInfo, inputData[0].data())}},
        {{0, armnn::ConstTensor(inputTensorInfo, inputData[1].data())}}
    };
    std::vector<OutputTensors> outputTensors
    {
        {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[0].data())}},
        {{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData[1].data())}}
    };

    // Lambda function to execute the network. We use it as the thread function. Each call creates its own
    // working memory handle, so concurrent executions do not share intermediate buffers.
    auto execute = [&](unsigned int executionIndex)
    {
        auto memHandle = run->CreateWorkingMemHandle(networkIdentifier);
        run->Execute(*memHandle, inputTensors[executionIndex], outputTensors[executionIndex]);
    };
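
    // A possible variation (a sketch, assuming the same IRuntime API): create one working memory handle
    // per execution up front and reuse it, instead of allocating a new one inside every thread:
    //
    //     auto memHandle0 = run->CreateWorkingMemHandle(networkIdentifier);
    //     auto memHandle1 = run->CreateWorkingMemHandle(networkIdentifier);
    //     // thread 0: run->Execute(*memHandle0, inputTensors[0], outputTensors[0]);
    //     // thread 1: run->Execute(*memHandle1, inputTensors[1], outputTensors[1]);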

    // Prepare some threads and let each execute the network with a different input
    std::vector<std::thread> threads;
    for (unsigned int i = 0; i < inputTensors.size(); ++i)
    {
        threads.emplace_back(execute, i);
    }

    // Wait for the threads to finish
    for (std::thread& t : threads)
    {
        if (t.joinable())
        {
            t.join();
        }
    }

    std::cout << "Your numbers were " << outputData[0][0] << " and " << outputData[1][0] << std::endl;
    return 0;
}