From 558a1d4ed904f5f7d04781bc3405ee77669563d0 Mon Sep 17 00:00:00 2001
From: Narumol Prangnawarat
Date: Mon, 7 Feb 2022 13:12:24 +0000
Subject: Revert "Revert "IVGCVSW-6700 Enable import aligned host memory in android-nn-driver""

This reverts commit 8069603dc44b7673b356f66517cd8b25af8080f0.

* Reason for revert: Try reenable import aligned host memory in android-nn-driver
* Added a check to ArmNNDriverImpl.cpp to not call ExecuteWithDummyInputs with GpuAcc
* Added new android-nn-driver driver options to enable / disable Import and Export
* Import is disabled by default for now due to conv2d issues
* Export is enabled by default

!armnn:7147

Change-Id: I91110c58ebb3931d1c458e3774944e55c1250dd8
Signed-off-by: David Monahan
---
 1.2/ArmnnDriverImpl.cpp    |  6 +++++-
 1.3/ArmnnDriverImpl.cpp    |  6 +++++-
 ArmnnDriverImpl.cpp        | 40 +++++++++++++++++++++++-----------------
 ArmnnPreparedModel.cpp     | 36 +++++++++++++++++++++++++++++++++---
 ArmnnPreparedModel.hpp     |  6 +++++-
 ArmnnPreparedModel_1_2.cpp | 27 +++++++++++++++++++++++++--
 ArmnnPreparedModel_1_2.hpp |  8 +++++++-
 ArmnnPreparedModel_1_3.cpp | 26 ++++++++++++++++++++++++--
 ArmnnPreparedModel_1_3.hpp |  8 +++++++-
 DriverOptions.cpp          | 14 +++++++++++++-
 DriverOptions.hpp          |  4 ++++
 test/Concurrent.cpp        |  5 +++--
 test/DriverTestHelpers.hpp |  4 +++-
 13 files changed, 157 insertions(+), 33 deletions(-)

diff --git a/1.2/ArmnnDriverImpl.cpp b/1.2/ArmnnDriverImpl.cpp
index 3274a8ab..1c31384a 100644
--- a/1.2/ArmnnDriverImpl.cpp
+++ b/1.2/ArmnnDriverImpl.cpp
@@ -267,7 +267,9 @@ Return ArmnnDriverImpl::prepareArmnnModel_1_2(
         options.GetRequestInputsAndOutputsDumpDir(),
         options.IsGpuProfilingEnabled(),
         options.isAsyncModelExecutionEnabled(),
-        options.getNoOfArmnnThreads()));
+        options.getNoOfArmnnThreads(),
+        options.isImportEnabled(),
+        options.isExportEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
@@ -630,6 +632,8 @@ Return ArmnnDriverImpl::prepareModelFromCache(
         options.IsGpuProfilingEnabled(),
         options.isAsyncModelExecutionEnabled(),
         options.getNoOfArmnnThreads(),
+        options.isImportEnabled(),
+        options.isExportEnabled(),
         true));
 
     NotifyCallbackAndCheck(cb, V1_0::ErrorStatus::NONE, preparedModel.release());
diff --git a/1.3/ArmnnDriverImpl.cpp b/1.3/ArmnnDriverImpl.cpp
index c8b1d968..474e1c1f 100644
--- a/1.3/ArmnnDriverImpl.cpp
+++ b/1.3/ArmnnDriverImpl.cpp
@@ -281,7 +281,9 @@ Return ArmnnDriverImpl::prepareArmnnModel_1_3(
         options.IsGpuProfilingEnabled(),
         priority,
         options.isAsyncModelExecutionEnabled(),
-        options.getNoOfArmnnThreads()));
+        options.getNoOfArmnnThreads(),
+        options.isImportEnabled(),
+        options.isExportEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
@@ -645,6 +647,8 @@ Return ArmnnDriverImpl::prepareModelFromCache_1_3(
         V1_3::Priority::MEDIUM,
         options.isAsyncModelExecutionEnabled(),
         options.getNoOfArmnnThreads(),
+        options.isImportEnabled(),
+        options.isExportEnabled(),
         true));
 
     NotifyCallbackAndCheck(cb, V1_3::ErrorStatus::NONE, preparedModel.release());
diff --git a/ArmnnDriverImpl.cpp b/ArmnnDriverImpl.cpp
index 0b3b9191..89fa54fc 100644
--- a/ArmnnDriverImpl.cpp
+++ b/ArmnnDriverImpl.cpp
@@ -202,30 +202,36 @@ Return ArmnnDriverImpl::prepareModel(
         options.GetRequestInputsAndOutputsDumpDir(),
         options.IsGpuProfilingEnabled(),
         options.isAsyncModelExecutionEnabled(),
-        options.getNoOfArmnnThreads()));
+        options.getNoOfArmnnThreads(),
+        options.isImportEnabled(),
+        options.isExportEnabled()));
 
-    // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
-    // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
-    if (!preparedModel->ExecuteWithDummyInputs())
+    if (std::find(options.GetBackends().begin(),
+                  options.GetBackends().end(),
+                  armnn::Compute::GpuAcc) != options.GetBackends().end())
     {
-        return FailPrepareModel(V1_0::ErrorStatus::GENERAL_FAILURE, "Network could not be executed", cb);
-    }
-
-    if (clTunedParameters &&
-        options.GetClTunedParametersMode() == armnn::IGpuAccTunedParameters::Mode::UpdateTunedParameters)
-    {
-        // Now that we've done one inference the CL kernel parameters will have been tuned, so save the updated file.
-        try
+        // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
+        // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
+        if (!preparedModel->ExecuteWithDummyInputs())
         {
-            clTunedParameters->Save(options.GetClTunedParametersFile().c_str());
+            return FailPrepareModel(V1_0::ErrorStatus::GENERAL_FAILURE, "Network could not be executed", cb);
         }
-        catch (std::exception& error)
+
+        if (clTunedParameters &&
+            options.GetClTunedParametersMode() == armnn::IGpuAccTunedParameters::Mode::UpdateTunedParameters)
         {
-            ALOGE("ArmnnDriverImpl::prepareModel: Failed to save CL tuned parameters file '%s': %s",
-                  options.GetClTunedParametersFile().c_str(), error.what());
+            // Now that we've done one inference the CL kernel parameters will have been tuned, so save the updated file
+            try
+            {
+                clTunedParameters->Save(options.GetClTunedParametersFile().c_str());
+            }
+            catch (std::exception& error)
+            {
+                ALOGE("ArmnnDriverImpl::prepareModel: Failed to save CL tuned parameters file '%s': %s",
+                      options.GetClTunedParametersFile().c_str(), error.what());
+            }
         }
     }
-
     NotifyCallbackAndCheck(cb, V1_0::ErrorStatus::NONE, preparedModel);
 
     return V1_0::ErrorStatus::NONE;
diff --git a/ArmnnPreparedModel.cpp b/ArmnnPreparedModel.cpp
index 38f1bc20..326351c0 100644
--- a/ArmnnPreparedModel.cpp
+++ b/ArmnnPreparedModel.cpp
@@ -8,6 +8,8 @@
 #include "ArmnnPreparedModel.hpp"
 #include "Utils.hpp"
 
+#include
+
 #include
 #include
 #include
@@ -116,7 +118,9 @@ ArmnnPreparedModel::ArmnnPreparedModel(armnn::NetworkId networkId,
                                        const std::string& requestInputsAndOutputsDumpDir,
                                        const bool gpuProfilingEnabled,
                                        const bool asyncModelExecutionEnabled,
-                                       const unsigned int numberOfThreads)
+                                       const unsigned int numberOfThreads,
+                                       const bool importEnabled,
+                                       const bool exportEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -124,6 +128,8 @@ ArmnnPreparedModel::ArmnnPreparedModel(armnn::NetworkId networkId,
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
 {
     // Enable profiling if required.
     m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
@@ -308,7 +314,19 @@ void ArmnnPreparedModel::ExecuteGraph(
     else
     {
         ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false");
-        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+        std::vector<armnn::ImportedInputId> importedInputIds;
+        if (m_EnableImport)
+        {
+            importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+        }
+        std::vector<armnn::ImportedOutputId> importedOutputIds;
+        if (m_EnableExport)
+        {
+            importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+        }
+        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                            importedInputIds, importedOutputIds);
     }
 
     if (status != armnn::Status::Success)
@@ -389,7 +407,19 @@ bool ArmnnPreparedModel::ExecuteWithDummyInputs()
     else
     {
         ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false");
-        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+        std::vector<armnn::ImportedInputId> importedInputIds;
+        if (m_EnableImport)
+        {
+            importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+        }
+        std::vector<armnn::ImportedOutputId> importedOutputIds;
+        if (m_EnableExport)
+        {
+            importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+        }
+        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                            importedInputIds, importedOutputIds);
     }
     if (status != armnn::Status::Success)
     {
diff --git a/ArmnnPreparedModel.hpp b/ArmnnPreparedModel.hpp
index 685d950e..0d19c077 100644
--- a/ArmnnPreparedModel.hpp
+++ b/ArmnnPreparedModel.hpp
@@ -41,7 +41,9 @@ public:
                        const std::string& requestInputsAndOutputsDumpDir,
                        const bool gpuProfilingEnabled,
                        const bool asyncModelExecutionEnabled = false,
-                       const unsigned int numberOfThreads = 1);
+                       const unsigned int numberOfThreads = 1,
+                       const bool importEnabled = false,
+                       const bool exportEnabled = true);
 
     virtual ~ArmnnPreparedModel();
 
@@ -111,6 +113,8 @@ private:
     static std::unique_ptr m_Threadpool;
     std::shared_ptr m_WorkingMemHandle;
     const bool m_AsyncModelExecutionEnabled;
+    const bool m_EnableImport;
+    const bool m_EnableExport;
 };
 
 }
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp
index 7f35e60f..37bc3a49 100644
--- a/ArmnnPreparedModel_1_2.cpp
+++ b/ArmnnPreparedModel_1_2.cpp
@@ -9,6 +9,8 @@
 
 #include "Utils.hpp"
 
+#include
+
 #include
 #include
 #include
@@ -151,7 +153,9 @@ ArmnnPreparedModel_1_2::ArmnnPreparedModel_1_2(armnn::NetworkId netw
                                                const std::string& requestInputsAndOutputsDumpDir,
                                                const bool gpuProfilingEnabled,
                                                const bool asyncModelExecutionEnabled,
-                                               const unsigned int numberOfThreads)
+                                               const unsigned int numberOfThreads,
+                                               const bool importEnabled,
+                                               const bool exportEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -159,6 +163,8 @@ ArmnnPreparedModel_1_2::ArmnnPreparedModel_1_2(armnn::NetworkId netw
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(false)
 {
     // Enable profiling if required.
@@ -192,6 +198,8 @@ ArmnnPreparedModel_1_2::ArmnnPreparedModel_1_2(armnn::NetworkId netw
                                                const bool gpuProfilingEnabled,
                                                const bool asyncModelExecutionEnabled,
                                                const unsigned int numberOfThreads,
+                                               const bool importEnabled,
+                                               const bool exportEnabled,
                                                const bool preparedFromCache)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
@@ -199,6 +207,8 @@ ArmnnPreparedModel_1_2::ArmnnPreparedModel_1_2(armnn::NetworkId netw
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(preparedFromCache)
 {
     // Enable profiling if required.
@@ -531,7 +541,20 @@ bool ArmnnPreparedModel_1_2::ExecuteGraph(
     else
     {
         ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false");
-        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+
+        // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+        std::vector<armnn::ImportedInputId> importedInputIds;
+        if (m_EnableImport)
+        {
+            importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+        }
+        std::vector<armnn::ImportedOutputId> importedOutputIds;
+        if (m_EnableExport)
+        {
+            importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+        }
+        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                            importedInputIds, importedOutputIds);
     }
 
     if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
diff --git a/ArmnnPreparedModel_1_2.hpp b/ArmnnPreparedModel_1_2.hpp
index 255fc187..6afcbbe4 100644
--- a/ArmnnPreparedModel_1_2.hpp
+++ b/ArmnnPreparedModel_1_2.hpp
@@ -47,7 +47,9 @@ public:
                            const std::string& requestInputsAndOutputsDumpDir,
                            const bool gpuProfilingEnabled,
                            const bool asyncModelExecutionEnabled = false,
-                           const unsigned int numberOfThreads = 1);
+                           const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true);
 
     ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                            armnn::IRuntime* runtime,
@@ -55,6 +57,8 @@ public:
                            const bool gpuProfilingEnabled,
                            const bool asyncModelExecutionEnabled = false,
                            const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true,
                            const bool preparedFromCache = false);
 
     virtual ~ArmnnPreparedModel_1_2();
@@ -164,6 +168,8 @@ private:
     static std::unique_ptr m_Threadpool;
     std::shared_ptr m_WorkingMemHandle;
     const bool m_AsyncModelExecutionEnabled;
+    const bool m_EnableImport;
+    const bool m_EnableExport;
     const bool m_PreparedFromCache;
 };
 
diff --git a/ArmnnPreparedModel_1_3.cpp b/ArmnnPreparedModel_1_3.cpp
index 36575b86..34c42ecd 100644
--- a/ArmnnPreparedModel_1_3.cpp
+++ b/ArmnnPreparedModel_1_3.cpp
@@ -12,6 +12,8 @@
 #include "ArmnnPreparedModel_1_3.hpp"
 #include "Utils.hpp"
 
+#include
+
 #include
 #include
 #include
@@ -172,7 +174,9 @@ ArmnnPreparedModel_1_3::ArmnnPreparedModel_1_3(armnn::NetworkId netw
                                                const bool gpuProfilingEnabled,
                                                V1_3::Priority priority,
                                                const bool asyncModelExecutionEnabled,
-                                               const unsigned int numberOfThreads)
+                                               const unsigned int numberOfThreads,
+                                               const bool importEnabled,
+                                               const bool exportEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -181,6 +185,8 @@ ArmnnPreparedModel_1_3::ArmnnPreparedModel_1_3(armnn::NetworkId netw
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_ModelPriority(priority)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(false)
 {
     // Enable profiling if required.
@@ -215,6 +221,8 @@ ArmnnPreparedModel_1_3::ArmnnPreparedModel_1_3(armnn::NetworkId netw
                                                V1_3::Priority priority,
                                                const bool asyncModelExecutionEnabled,
                                                const unsigned int numberOfThreads,
+                                               const bool importEnabled,
+                                               const bool exportEnabled,
                                                const bool preparedFromCache)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
@@ -223,6 +231,8 @@ ArmnnPreparedModel_1_3::ArmnnPreparedModel_1_3(armnn::NetworkId netw
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_ModelPriority(priority)
     , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
+    , m_EnableImport(importEnabled)
+    , m_EnableExport(exportEnabled)
     , m_PreparedFromCache(preparedFromCache)
 {
     // Enable profiling if required.
@@ -824,7 +834,19 @@ Return ArmnnPreparedModel_1_3::ExecuteGraph(
     else
     {
         ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled false");
-        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
+        std::vector<armnn::ImportedInputId> importedInputIds;
+        if (m_EnableImport)
+        {
+            importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
+        }
+        std::vector<armnn::ImportedOutputId> importedOutputIds;
+        if (m_EnableExport)
+        {
+            importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
+        }
+        status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
+                                            importedInputIds, importedOutputIds);
     }
 
     if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
diff --git a/ArmnnPreparedModel_1_3.hpp b/ArmnnPreparedModel_1_3.hpp
index cd5fc0ed..cb3c7684 100644
--- a/ArmnnPreparedModel_1_3.hpp
+++ b/ArmnnPreparedModel_1_3.hpp
@@ -55,7 +55,9 @@ public:
                            const bool gpuProfilingEnabled,
                            V1_3::Priority priority = V1_3::Priority::MEDIUM,
                            const bool asyncModelExecutionEnabled = false,
-                           const unsigned int numberOfThreads = 1);
+                           const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true);
 
     ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                            armnn::IRuntime* runtime,
@@ -64,6 +66,8 @@ public:
                            V1_3::Priority priority = V1_3::Priority::MEDIUM,
                            const bool asyncModelExecutionEnabled = false,
                            const unsigned int numberOfThreads = 1,
+                           const bool importEnabled = false,
+                           const bool exportEnabled = true,
                            const bool preparedFromCache = false);
 
     virtual ~ArmnnPreparedModel_1_3();
@@ -201,6 +205,8 @@ private:
     static std::unique_ptr m_Threadpool;
     std::shared_ptr m_WorkingMemHandle;
     const bool m_AsyncModelExecutionEnabled;
+    const bool m_EnableImport;
+    const bool m_EnableExport;
     const bool m_PreparedFromCache;
 };
 
diff --git a/DriverOptions.cpp b/DriverOptions.cpp
index 8fd5c477..0018d975 100644
--- a/DriverOptions.cpp
+++ b/DriverOptions.cpp
@@ -40,6 +40,8 @@ DriverOptions::DriverOptions(armnn::Compute computeDevice, bool fp16Enabled)
     , m_NumberOfThreads(0)
     , m_EnableAsyncModelExecution(false)
     , m_ArmnnNumberOfThreads(1)
+    , m_EnableImport(false)
+    , m_EnableExport(true)
 {
 }
 
@@ -56,6 +58,8 @@ DriverOptions::DriverOptions(const std::vector& backends, bool
     , m_NumberOfThreads(0)
     , m_EnableAsyncModelExecution(false)
     , m_ArmnnNumberOfThreads(1)
+    , m_EnableImport(false)
+    , m_EnableExport(true)
 {
 }
 
@@ -71,6 +75,8 @@ DriverOptions::DriverOptions(int argc, char** argv)
     , m_NumberOfThreads(0)
     , m_EnableAsyncModelExecution(false)
     , m_ArmnnNumberOfThreads(1)
+    , m_EnableImport(false)
+    , m_EnableExport(true)
 {
     std::string unsupportedOperationsAsString;
     std::string clTunedParametersModeAsString;
@@ -168,7 +174,13 @@ DriverOptions::DriverOptions(int argc, char** argv)
         "Assign the number of threads used by ArmNN. "
         "Input value must be at least 1. "
         "Default is set to 1.",
-        cxxopts::value<unsigned int>(m_ArmnnNumberOfThreads)->default_value("1"));
+        cxxopts::value<unsigned int>(m_ArmnnNumberOfThreads)->default_value("1"))
+
+        ("I,enableImport", "Enable Importing of input buffers",
+        cxxopts::value<bool>(m_EnableImport)->default_value("false"))
+
+        ("E,enableExport", "Enable Exporting of output buffers",
+        cxxopts::value<bool>(m_EnableExport)->default_value("true"));
     }
     catch (const std::exception& e)
     {
diff --git a/DriverOptions.hpp b/DriverOptions.hpp
index e1d25c48..ee68a945 100644
--- a/DriverOptions.hpp
+++ b/DriverOptions.hpp
@@ -42,6 +42,8 @@ public:
     unsigned int GetNumberOfThreads() const { return m_NumberOfThreads; }
     bool isAsyncModelExecutionEnabled() const { return m_EnableAsyncModelExecution; };
     unsigned int getNoOfArmnnThreads() const { return m_ArmnnNumberOfThreads; };
+    bool isImportEnabled() const { return m_EnableImport; };
+    bool isExportEnabled() const { return m_EnableExport; };
 
 private:
     std::vector m_Backends;
@@ -63,6 +65,8 @@ private:
     unsigned int m_NumberOfThreads;
     bool m_EnableAsyncModelExecution;
     unsigned int m_ArmnnNumberOfThreads;
+    bool m_EnableImport;
+    bool m_EnableExport;
 };
 
 } // namespace armnn_driver
diff --git a/test/Concurrent.cpp b/test/Concurrent.cpp
index 4113a8da..71119cde 100644
--- a/test/Concurrent.cpp
+++ b/test/Concurrent.cpp
@@ -82,15 +82,16 @@ DOCTEST_TEST_CASE("ConcurrentExecute")
 
     // build the requests
     V1_0::Request requests[maxRequests];
+    android::sp<IMemory> inMemory[maxRequests];
     android::sp<IMemory> outMemory[maxRequests];
+    float indata[] = {2, 32, 16};
     float* outdata[maxRequests];
     for (size_t i = 0; i < maxRequests; ++i)
     {
        requests[i].inputs  = hidl_vec<RequestArgument>{input};
        requests[i].outputs = hidl_vec<RequestArgument>{output};
        // set the input data (matching source test)
-       float indata[] = {2, 32, 16};
-       AddPoolAndSetData(3, requests[i], indata);
+       inMemory[i] = AddPoolAndSetData(3, requests[i], indata);
        // add memory for the output
        outMemory[i] = AddPoolAndGetData<float>(1, requests[i]);
        outdata[i] = static_cast<float*>(static_cast<void*>(outMemory[i]->getPointer()));
diff --git a/test/DriverTestHelpers.hpp b/test/DriverTestHelpers.hpp
index 383c8fc2..98be0903 100644
--- a/test/DriverTestHelpers.hpp
+++ b/test/DriverTestHelpers.hpp
@@ -186,13 +186,15 @@ android::sp AddPoolAndGetData(uint32_t size, V1_0::Request& request)
 }
 
 template<typename T>
-void AddPoolAndSetData(uint32_t size, V1_0::Request& request, const T* data)
+android::sp<IMemory> AddPoolAndSetData(uint32_t size, V1_0::Request& request, const T* data)
 {
     android::sp<IMemory> memory = AddPoolAndGetData<T>(size, request);
 
     T* dst = static_cast<T*>(static_cast<void*>(memory->getPointer()));
 
     memcpy(dst, data, size * sizeof(T));
+
+    return memory;
 }
 
 template<typename T>
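
The block added to each ExecuteGraph/ExecuteWithDummyInputs above is the same in all of the prepared-model classes. Stripped of the HAL plumbing it reduces to the following sketch against the public armnn::IRuntime API; the helper name ExecuteWithOptionalImport and its signature are illustrative only and are not part of the driver:

// Sketch of the import/export execution path added by this patch (illustrative helper,
// not driver code). An empty imported-id vector means EnqueueWorkload copies as before.
#include <armnn/IRuntime.hpp>
#include <armnn/Types.hpp>

#include <vector>

armnn::Status ExecuteWithOptionalImport(armnn::IRuntime& runtime,
                                        armnn::NetworkId networkId,
                                        const armnn::InputTensors& inputTensors,
                                        const armnn::OutputTensors& outputTensors,
                                        bool enableImport,   // mirrors m_EnableImport
                                        bool enableExport)   // mirrors m_EnableExport
{
    std::vector<armnn::ImportedInputId> importedInputIds;
    if (enableImport)
    {
        // Import: hand suitably aligned host input buffers straight to the backend (zero copy).
        importedInputIds = runtime.ImportInputs(networkId, inputTensors, armnn::MemorySource::Malloc);
    }

    std::vector<armnn::ImportedOutputId> importedOutputIds;
    if (enableExport)
    {
        // Export: let the backend write results directly into the caller's output buffers.
        importedOutputIds = runtime.ImportOutputs(networkId, outputTensors, armnn::MemorySource::Malloc);
    }

    // Tensors not covered by the imported ids are copied, exactly as before the patch.
    return runtime.EnqueueWorkload(networkId, inputTensors, outputTensors,
                                   importedInputIds, importedOutputIds);
}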
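
The new options are surfaced in DriverOptions as -I/--enableImport and -E/--enableExport, with import off by default (the conv2d issues noted in the commit message) and export on by default. A minimal, hypothetical check of those defaults through one of the patched constructors:

// Hypothetical stand-alone check of the new DriverOptions defaults (not part of the patch).
#include "DriverOptions.hpp"

#include <armnn/ArmNN.hpp>

#include <cassert>

int main()
{
    // Uses the (computeDevice, fp16Enabled) constructor patched at the top of DriverOptions.cpp.
    armnn_driver::DriverOptions options(armnn::Compute::CpuAcc, /*fp16Enabled=*/false);

    assert(!options.isImportEnabled()); // m_EnableImport defaults to false
    assert(options.isExportEnabled());  // m_EnableExport defaults to true
    return 0;
}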