Diffstat (limited to 'ArmnnPreparedModel_1_2.cpp')
 ArmnnPreparedModel_1_2.cpp | 281
 1 file changed, 261 insertions(+), 20 deletions(-)
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp
index a2148c29..a401b30e 100644
--- a/ArmnnPreparedModel_1_2.cpp
+++ b/ArmnnPreparedModel_1_2.cpp
@@ -1,19 +1,22 @@
//
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#define LOG_TAG "ArmnnDriver"
#include "ArmnnPreparedModel_1_2.hpp"
+
#include "Utils.hpp"
+#include <armnn/Types.hpp>
+
#include <log/log.h>
#include <OperationsUtils.h>
#include <ExecutionBurstServer.h>
#include <ValidateHal.h>
-#include <cassert>
+#include <chrono>
#include <cinttypes>
#ifdef ARMNN_ANDROID_S
@@ -124,6 +127,9 @@ RequestThread<ArmnnPreparedModel_1_2, HalVersion, CallbackContext_1_2>
ArmnnPreparedModel_1_2<HalVersion>::m_RequestThread;
template<typename HalVersion>
+std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel_1_2<HalVersion>::m_Threadpool(nullptr);
+
+template<typename HalVersion>
template<typename TensorBindingCollection>
void ArmnnPreparedModel_1_2<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix,
const TensorBindingCollection& tensorBindings)
@@ -146,16 +152,88 @@ ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId netw
armnn::IRuntime* runtime,
const V1_2::Model& model,
const std::string& requestInputsAndOutputsDumpDir,
- const bool gpuProfilingEnabled)
+ const bool gpuProfilingEnabled,
+ const bool asyncModelExecutionEnabled,
+ const unsigned int numberOfThreads,
+ const bool importEnabled,
+ const bool exportEnabled)
: m_NetworkId(networkId)
, m_Runtime(runtime)
, m_Model(model)
, m_RequestCount(0)
, m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
, m_GpuProfilingEnabled(gpuProfilingEnabled)
+ , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+ , m_EnableImport(importEnabled)
+ , m_EnableExport(exportEnabled)
+ , m_PreparedFromCache(false)
{
// Enable profiling if required.
m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
+
+ if (m_AsyncModelExecutionEnabled)
+ {
+ std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
+ for (unsigned int i=0; i < numberOfThreads; ++i)
+ {
+ memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
+ }
+
+ if (!m_Threadpool)
+ {
+ m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
+ }
+ else
+ {
+ m_Threadpool->LoadMemHandles(memHandles);
+ }
+
+ m_WorkingMemHandle = memHandles.back();
+ }
+}
+
+template<typename HalVersion>
+ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
+ armnn::IRuntime* runtime,
+ const std::string& requestInputsAndOutputsDumpDir,
+ const bool gpuProfilingEnabled,
+ const bool asyncModelExecutionEnabled,
+ const unsigned int numberOfThreads,
+ const bool importEnabled,
+ const bool exportEnabled,
+ const bool preparedFromCache)
+ : m_NetworkId(networkId)
+ , m_Runtime(runtime)
+ , m_RequestCount(0)
+ , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
+ , m_GpuProfilingEnabled(gpuProfilingEnabled)
+ , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+ , m_EnableImport(importEnabled)
+ , m_EnableExport(exportEnabled)
+ , m_PreparedFromCache(preparedFromCache)
+{
+ // Enable profiling if required.
+ m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
+
+ if (m_AsyncModelExecutionEnabled)
+ {
+ std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
+ for (unsigned int i=0; i < numberOfThreads; ++i)
+ {
+ memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
+ }
+
+ if (!m_Threadpool)
+ {
+ m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
+ }
+ else
+ {
+ m_Threadpool->LoadMemHandles(memHandles);
+ }
+
+ m_WorkingMemHandle = memHandles.back();
+ }
}
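
Both constructors above share the same asynchronous-execution setup: one working memory handle per worker thread, and a process-wide armnn::Threadpool created by the first prepared model and reused by later ones. The following is a minimal sketch of that pattern; SetUpAsyncExecution is a hypothetical helper, but the Arm NN calls mirror the constructor code.

    // Sketch only: hypothetical helper isolating the async setup used by both constructors.
    #include <armnn/IRuntime.hpp>
    #include <armnn/IWorkingMemHandle.hpp>
    #include <armnn/Threadpool.hpp>
    #include <memory>
    #include <vector>

    void SetUpAsyncExecution(armnn::IRuntime* runtime,
                             armnn::NetworkId networkId,
                             unsigned int numberOfThreads,
                             std::unique_ptr<armnn::Threadpool>& threadpool,
                             std::shared_ptr<armnn::IWorkingMemHandle>& workingMemHandle)
    {
        // One working memory handle per worker thread; each one owns the intermediate
        // tensors for a single in-flight inference of this network.
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i = 0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(runtime->CreateWorkingMemHandle(networkId));
        }

        if (!threadpool)
        {
            // The first prepared model creates the shared pool.
            threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            // Later models only register their handles with the existing pool.
            threadpool->LoadMemHandles(memHandles);
        }

        // Keep one handle for the blocking IRuntime::Execute path in ExecuteGraph.
        workingMemHandle = memHandles.back();
    }
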
template<typename HalVersion>
@@ -163,12 +241,21 @@ ArmnnPreparedModel_1_2<HalVersion>::~ArmnnPreparedModel_1_2()
{
// Get a hold of the profiler used by this model.
std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId);
+ if (profiler && m_GpuProfilingEnabled)
+ {
+ // Dump the profiling info to a file if required.
+ DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId,
+ profiler.get());
+ }
// Unload the network associated with this model.
m_Runtime->UnloadNetwork(m_NetworkId);
- // Dump the profiling info to a file if required.
- DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId, profiler.get());
+ // Unload the network memhandles from the threadpool
+ if (m_AsyncModelExecutionEnabled)
+ {
+ m_Threadpool->UnloadMemHandles(m_NetworkId);
+ }
}
template<typename HalVersion>
@@ -225,8 +312,20 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForIn
for (unsigned int i = 0; i < request.inputs.size(); i++)
{
const auto& inputArg = request.inputs[i];
+ armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
+ // inputs (of type InputTensors) is composed of a vector of ConstTensors.
+ // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
+ inputTensorInfo.SetConstant();
+ auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request,
+ inputTensorInfo,
+ inputArg,
+ "input");
+
+ if (result != V1_0::ErrorStatus::NONE)
+ {
+ return result;
+ }
- const armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools);
if (inputTensor.GetMemoryArea() == nullptr)
@@ -252,8 +351,17 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForOu
for (unsigned int i = 0; i < request.outputs.size(); i++)
{
const auto& outputArg = request.outputs[i];
+ armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
+ auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request,
+ outputTensorInfo,
+ outputArg,
+ "output");
+
+ if (result != V1_0::ErrorStatus::NONE)
+ {
+ return result;
+ }
- const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools);
if (outputTensor.GetMemoryArea() == nullptr)
{
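
The two hunks above add per-argument validation and make the input TensorInfo mutable so that SetConstant() can be applied before the tensor is wrapped. A minimal, hypothetical sketch of the reason: armnn::InputTensors is a vector of (binding id, ConstTensor) pairs, and the ConstTensor wrapper expects its TensorInfo to be flagged constant.

    // Sketch only: MakeInputTensors is a hypothetical helper; mappedRequestMemory stands
    // in for the request pool memory returned by GetTensorForRequestArgument.
    #include <armnn/Tensor.hpp>
    #include <armnn/Types.hpp>

    armnn::InputTensors MakeInputTensors(const armnn::TensorInfo& boundTensorInfo,
                                         const void* mappedRequestMemory,
                                         armnn::LayerBindingId bindingId)
    {
        armnn::TensorInfo inputTensorInfo = boundTensorInfo;
        inputTensorInfo.SetConstant();   // required before constructing the ConstTensor below
        armnn::ConstTensor inputTensor(inputTensorInfo, mappedRequestMemory);
        return armnn::InputTensors{ { bindingId, inputTensor } };
    }
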
@@ -277,7 +385,7 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForOu
return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
}
#else
- const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getMemory().size;
+ const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getSize();
if (bufferSize < outputSize)
{
ALOGW("ArmnnPreparedModel_1_2::Execute failed bufferSize (%s) < outputSize (%s)",
@@ -352,7 +460,10 @@ Return<void> ArmnnPreparedModel_1_2<HalVersion>::executeSynchronously(const V1_0
V1_2::MeasureTiming measureTiming,
executeSynchronously_cb cb)
{
- ALOGV("ArmnnPreparedModel_1_2::executeSynchronously(): %s", GetModelSummary(m_Model).c_str());
+ if (!m_PreparedFromCache)
+ {
+ ALOGV("ArmnnPreparedModel_1_2::executeSynchronously(): %s", GetModelSummary(m_Model).c_str());
+ }
m_RequestCount++;
if (cb == nullptr)
@@ -368,7 +479,7 @@ Return<void> ArmnnPreparedModel_1_2<HalVersion>::executeSynchronously(const V1_0
driverStart = Now();
}
- if (!android::nn::validateRequest(request, m_Model))
+ if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
{
ALOGE("ArmnnPreparedModel_1_2::executeSynchronously invalid request model");
cb(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming);
@@ -419,6 +530,8 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph(
ALOGV("ArmnnPreparedModel_1_2::ExecuteGraph(...)");
TimePoint driverEnd, deviceStart, deviceEnd;
+ // Capture the graph execution start time.
+ std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now();
DumpTensorsIfRequired("Input", inputTensors);
@@ -440,7 +553,30 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph(
deviceStart = Now();
}
- armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+ armnn::Status status;
+ if (m_AsyncModelExecutionEnabled)
+ {
+ ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled true");
+ status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
+ }
+ else
+ {
+ ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false");
+
+ // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+ std::vector<armnn::ImportedInputId> importedInputIds;
+ if (m_EnableImport)
+ {
+ importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+ }
+ std::vector<armnn::ImportedOutputId> importedOutputIds;
+ if (m_EnableExport)
+ {
+ importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+ }
+ status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+ importedInputIds, importedOutputIds);
+ }
if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
{
@@ -477,24 +613,33 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph(
V1_2::Timing timing;
timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart);
timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.ctx.driverStart);
- ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu", timing.timeOnDevice,
- timing.timeInDriver);
+ ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu",
+ static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
cb.callback(V1_0::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph");
} else {
cb.callback(V1_0::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
}
+ // Log the total time in this call. This is a good number to compare to that printed out by
+ // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver.
+ ALOGI("ArmnnPreparedModel_1_2::ExecuteGraph Execution time = %lld µs",
+ std::chrono::duration_cast<std::chrono::microseconds>
+ (std::chrono::system_clock::now() - graphExecutionStart).count());
return true;
}
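
When asynchronous execution is disabled, the branch above optionally pre-imports the caller's buffers so EnqueueWorkload can operate on them directly instead of copying. A standalone sketch of that path, assuming the network is already loaded and the tensors already reference the mapped request pools (the helper name is illustrative):

    // Sketch only: mirrors the synchronous branch of ExecuteGraph above.
    #include <armnn/IRuntime.hpp>
    #include <vector>

    armnn::Status RunWithOptionalZeroCopy(armnn::IRuntime* runtime,
                                          armnn::NetworkId networkId,
                                          const armnn::InputTensors& inputTensors,
                                          const armnn::OutputTensors& outputTensors,
                                          bool enableImport,
                                          bool enableExport)
    {
        // Empty ID vectors mean "copy everything"; populated vectors let the backend
        // read and write the caller's malloc'd buffers in place.
        std::vector<armnn::ImportedInputId> importedInputIds;
        if (enableImport)
        {
            importedInputIds = runtime->ImportInputs(networkId, inputTensors, armnn::MemorySource::Malloc);
        }
        std::vector<armnn::ImportedOutputId> importedOutputIds;
        if (enableExport)
        {
            importedOutputIds = runtime->ImportOutputs(networkId, outputTensors, armnn::MemorySource::Malloc);
        }
        return runtime->EnqueueWorkload(networkId, inputTensors, outputTensors,
                                        importedInputIds, importedOutputIds);
    }
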
template<typename HalVersion>
-bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs()
+bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs)
{
std::vector<std::vector<char>> storage;
armnn::InputTensors inputTensors;
- for (unsigned int i = 0; i < getMainModel(m_Model).inputIndexes.size(); i++)
+ for (unsigned int i = 0; i < numInputs; i++)
{
- const armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
+ armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
+ // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors.
+ // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
+ inputTensorInfo.SetConstant();
+
storage.emplace_back(inputTensorInfo.GetNumBytes());
const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());
@@ -502,7 +647,7 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs()
}
armnn::OutputTensors outputTensors;
- for (unsigned int i = 0; i < getMainModel(m_Model).outputIndexes.size(); i++)
+ for (unsigned int i = 0; i < numOutputs; i++)
{
const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
storage.emplace_back(outputTensorInfo.GetNumBytes());
@@ -534,10 +679,13 @@ Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_
ctx.driverStart = Now();
}
- ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str());
+ if (!m_PreparedFromCache)
+ {
+ ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str());
+ }
m_RequestCount++;
- if (!android::nn::validateRequest(request, m_Model))
+ if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
{
callback(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
return V1_0::ErrorStatus::INVALID_ARGUMENT;
@@ -567,12 +715,21 @@ Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_
{}
}
- ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
// post the request for asynchronous execution
CallbackContext_1_2 cb;
cb.callback = callback;
cb.ctx = ctx;
+
+ if (m_AsyncModelExecutionEnabled)
+ {
+ ALOGV("ArmnnPreparedModel_1_2::execute(...) before ScheduleGraphForExecution");
+ ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb);
+ ALOGV("ArmnnPreparedModel_1_2::execute(...) after ScheduleGraphForExecution");
+ return V1_0::ErrorStatus::NONE;
+ }
+
+ ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb);
ALOGV("ArmnnPreparedModel_1_2::execute(...) after PostMsg");
return V1_0::ErrorStatus::NONE;
@@ -602,6 +759,84 @@ Return<void> ArmnnPreparedModel_1_2<HalVersion>::configureExecutionBurst(
return Void();
}
+/// Schedule the graph prepared from the request for execution
+template<typename HalVersion>
+template<typename CallbackContext>
+void ArmnnPreparedModel_1_2<HalVersion>::ScheduleGraphForExecution(
+ std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
+ std::shared_ptr<armnn::InputTensors>& inputTensors,
+ std::shared_ptr<armnn::OutputTensors>& outputTensors,
+ CallbackContext callbackContext)
+{
+ ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution(...)");
+
+ DumpTensorsIfRequired("Input", *inputTensors);
+
+ unsigned int outputTensorSize = outputTensors.get()->size();
+ std::vector<V1_2::OutputShape> outputShapes(outputTensorSize);
+ for (unsigned int i = 0; i < outputTensorSize; i++)
+ {
+ std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i);
+ const armnn::Tensor outputTensor = outputTensorPair.second;
+ const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();
+
+ outputShapes[i] = ComputeShape(outputTensorInfo);
+ }
+
+ auto tpCb = std::make_shared<
+ ArmnnThreadPoolCallback_1_2<CallbackContext_1_2>>(this,
+ pMemPools,
+ outputShapes,
+ inputTensors,
+ outputTensors,
+ callbackContext);
+
+ m_Threadpool->Schedule(m_NetworkId,
+ *tpCb->m_InputTensors,
+ *tpCb->m_OutputTensors,
+ armnn::QosExecPriority::Medium,
+ tpCb);
+ ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution end");
+}
+
+template<typename HalVersion>
+template <typename CallbackContext>
+void ArmnnPreparedModel_1_2<HalVersion>::ArmnnThreadPoolCallback_1_2<CallbackContext>::Notify(
+ armnn::Status status, armnn::InferenceTimingPair timeTaken)
+{
+ ALOGV("ArmnnPreparedModel_1_2::ArmnnThreadPoolCallback_1_2 Notify");
+
+ TimePoint driverEnd;
+
+ CommitPools(*m_MemPools);
+
+ m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors);
+
+ if (status != armnn::Status::Success)
+ {
+ ALOGW("ArmnnThreadPoolCallback::Notify EnqueueWorkload failed");
+ m_CallbackContext.callback(
+ V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel::ExecuteGraph");
+ return;
+ }
+
+ if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES)
+ {
+ driverEnd = std::chrono::steady_clock::now();
+ V1_2::Timing timing;
+ timing.timeOnDevice = MicrosecondsDuration(timeTaken.second, timeTaken.first);
+ timing.timeInDriver = MicrosecondsDuration(driverEnd, m_CallbackContext.ctx.driverStart);
+ ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu",
+ static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
+ m_CallbackContext.callback(
+ V1_0::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph");
+ } else {
+ m_CallbackContext.callback(
+ V1_0::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
+ }
+ return;
+}
+
#if defined(ARMNN_ANDROID_NN_V1_2) || defined(ARMNN_ANDROID_NN_V1_3)
template class ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>;
template bool ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ExecuteGraph<CallbackContext_1_2>(
@@ -609,6 +844,12 @@ template bool ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ExecuteGraph<CallbackC
armnn::InputTensors& pInputTensors,
armnn::OutputTensors& pOutputTensors,
CallbackContext_1_2 cb);
+
+template void ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_2>(
+ std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
+ std::shared_ptr<armnn::InputTensors>& inputTensors,
+ std::shared_ptr<armnn::OutputTensors>& outputTensors,
+ CallbackContext_1_2 callbackContext);
#endif
} // namespace armnn_driver
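
For reference, the new constructor arguments are supplied from the driver's model-preparation path. A hypothetical, simplified call site is sketched below; the helper, flag values, and thread count are placeholders, and only the constructor signature matches the code above.

    // Sketch only: illustrative wiring of the extended constructor.
    #include "ArmnnPreparedModel_1_2.hpp"

    using namespace armnn_driver;

    android::sp<ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>> MakePreparedModel(
        armnn::NetworkId networkId,
        armnn::IRuntime* runtime,
        const V1_2::Model& model,
        const std::string& requestInputsAndOutputsDumpDir)
    {
        const bool gpuProfilingEnabled        = false;
        const bool asyncModelExecutionEnabled = true;   // route requests through armnn::Threadpool
        const unsigned int numberOfThreads    = 2;      // one working memory handle per thread
        const bool importEnabled              = false;  // zero-copy inputs via ImportInputs
        const bool exportEnabled              = false;  // zero-copy outputs via ImportOutputs

        return new ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>(networkId,
                                                              runtime,
                                                              model,
                                                              requestInputsAndOutputsDumpDir,
                                                              gpuProfilingEnabled,
                                                              asyncModelExecutionEnabled,
                                                              numberOfThreads,
                                                              importEnabled,
                                                              exportEnabled);
    }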