From 8f397a1efed11e17e9f8cb12b53a72b7e32ab978 Mon Sep 17 00:00:00 2001 From: Sadik Armagan Date: Fri, 17 Jun 2022 15:38:22 +0100 Subject: IVGCVSW-6989 "Merged experimental/armnn_shim_sl" * Updated Serializer CMakeLists.txt to build armnnSerializerObj * Added constant tensors as input support to SL Signed-off-by: Sadik Armagan Change-Id: I22f6cf50147d99a01f7fe70d7446b114a4c57af3 --- shim/sl/canonical/ArmnnPreparedModel.cpp | 697 +++++++++++++++++++++++++++++++ 1 file changed, 697 insertions(+) create mode 100644 shim/sl/canonical/ArmnnPreparedModel.cpp (limited to 'shim/sl/canonical/ArmnnPreparedModel.cpp') diff --git a/shim/sl/canonical/ArmnnPreparedModel.cpp b/shim/sl/canonical/ArmnnPreparedModel.cpp new file mode 100644 index 0000000000..22e09008ba --- /dev/null +++ b/shim/sl/canonical/ArmnnPreparedModel.cpp @@ -0,0 +1,697 @@ +// +// Copyright © 2022 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#define LOG_TAG "arm-armnn-sl" + +#include "ArmnnPreparedModel.hpp" +#include "CanonicalUtils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace android; +using namespace android::nn; + +static const Timing g_NoTiming = {}; + +namespace { + +using namespace armnn_driver; + +unsigned long MicrosecondsDuration(android::nn::TimePoint endPoint, android::nn::TimePoint startPoint) +{ + return static_cast(std::chrono::duration_cast( + endPoint - startPoint).count()); +} + +bool ValidateRequestArgument(const Request::Argument& requestArg, const armnn::TensorInfo& tensorInfo) +{ + if (requestArg.dimensions.size() != 0) + { + if (requestArg.dimensions.size() != tensorInfo.GetNumDimensions()) + { + VLOG(DRIVER) << "Mismatched dimensions (request argument: " + << requestArg.dimensions.size() << " expected: " << tensorInfo.GetNumDimensions(); + return false; + } + + for (unsigned int d = 0; d < tensorInfo.GetNumDimensions(); ++d) + { + if (requestArg.dimensions[d] != 0 && requestArg.dimensions[d] != tensorInfo.GetShape()[d]) + { + VLOG(DRIVER) << "Mismatched dimensions " << d + << " (request argument: " << requestArg.dimensions[d] + << " expected: " << tensorInfo.GetShape()[d]; + return false; + } + } + } + + return true; +} + +armnn::Tensor GetTensorForRequestArgument(const Request::Argument& requestArg, + const armnn::TensorInfo& tensorInfo, + const std::vector<::android::nn::RunTimePoolInfo>& requestPools) +{ + if (!ValidateRequestArgument(requestArg, tensorInfo)) + { + return armnn::Tensor(); + } + + if (requestArg.lifetime == Request::Argument::LifeTime::POINTER) + { + return armnn::Tensor(tensorInfo, GetMemoryFromPointer(requestArg)); + } + else if (requestArg.lifetime == Request::Argument::LifeTime::POOL) + { + return armnn::Tensor(tensorInfo, GetMemoryFromPool(requestArg.location, requestPools)); + } + return armnn::Tensor(); +} + +inline std::string BuildTensorName(const char* tensorNamePrefix, std::size_t index) +{ + return tensorNamePrefix + std::to_string(index); +} + +bool IsPointerTypeMemory(const Request& request) +{ + for (auto& input : request.inputs) + { + if (input.lifetime == Request::Argument::LifeTime::POINTER) + { + return true; + } + } + + for (auto& output: request.outputs) + { + if (output.lifetime == Request::Argument::LifeTime::POINTER) + { + return true; + } + } + + return false; +} + +} // anonymous namespace + +using namespace android::nn; + +namespace armnn_driver +{ + +void ArmnnPreparedModel::Init() +{ + // Enable profiling if 
required. + m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled); +} + +ArmnnPreparedModel::ArmnnPreparedModel(armnn::NetworkId networkId, + armnn::IRuntime* runtime, + const Model& model, + const std::string& requestInputsAndOutputsDumpDir, + const bool gpuProfilingEnabled, + Priority priority) + : m_NetworkId(networkId) + , m_Runtime(runtime) + , m_Model(model) + , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir) + , m_GpuProfilingEnabled(gpuProfilingEnabled) + , m_ModelPriority(priority) + , m_PrepareFromCache(false) +{ + Init(); +} + +ArmnnPreparedModel::ArmnnPreparedModel(armnn::NetworkId networkId, + armnn::IRuntime* runtime, + const std::string& requestInputsAndOutputsDumpDir, + const bool gpuProfilingEnabled, + Priority priority, + const bool prepareModelFromCache) + : m_NetworkId(networkId) + , m_Runtime(runtime) + , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir) + , m_GpuProfilingEnabled(gpuProfilingEnabled) + , m_ModelPriority(priority) + , m_PrepareFromCache(prepareModelFromCache) +{ + Init(); +} + + +ErrorStatus ArmnnPreparedModel::PrepareMemoryForInputs( + armnn::InputTensors& inputs, + const Request& request, + const std::vector& memPools) const +{ + inputs.reserve(request.inputs.size()); + for (unsigned int i = 0; i < request.inputs.size(); i++) + { + const auto& inputArg = request.inputs[i]; + + armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + // inputs (of type InputTensors) is composed of a vector of ConstTensors. + // Therefore, set all TensorInfo isConstant parameters of input Tensors to true. + inputTensorInfo.SetConstant(); + const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools); + + if (inputTensor.GetMemoryArea() == nullptr) + { + VLOG(DRIVER) << "Cannot execute request. Error converting request input " << i << "to tensor."; + return ErrorStatus::GENERAL_FAILURE; + } + inputs.emplace_back(i, inputTensor); + } + + return ErrorStatus::NONE; +} + +ErrorStatus ArmnnPreparedModel::PrepareMemoryForOutputs( + armnn::OutputTensors& outputs, + std::vector &outputShapes, + const Request& request, + const std::vector& memPools) const +{ + outputs.reserve(request.outputs.size()); + for (unsigned int i = 0; i < request.outputs.size(); i++) + { + auto& outputArg = request.outputs[i]; + + armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i); + armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools); + if (outputTensor.GetMemoryArea() == nullptr) + { + VLOG(DRIVER) << "Cannot execute request. 
Error converting request output " << i << "to tensor."; + return ErrorStatus::GENERAL_FAILURE; + } + + const size_t outputSize = outputTensorInfo.GetNumBytes(); + + unsigned int count = 0; + std::for_each(outputArg.dimensions.begin(), outputArg.dimensions.end(), [&](auto dim) + { + if (dim != 0) + { + outputTensorInfo.GetShape()[count] = dim; + } + else + { + outputTensorInfo.GetShape()[count] = outputArg.dimensions.size(); + } + + count++; + }); + + outputs.emplace_back(i, outputTensor); + outputShapes[i] = ComputeShape(outputTensorInfo); + + if (outputArg.location.length < outputSize) + { + VLOG(DRIVER) << "ArmnnPreparedModel::Execute failed outputArg.location.length " + << std::to_string(outputArg.location.length).c_str() + << " < outputSize " << std::to_string(outputSize).c_str(); + outputShapes[i].isSufficient = false; + return ErrorStatus::OUTPUT_INSUFFICIENT_SIZE; + } + + //TODO: Need to check for Request::Argument::LifeTime::POINTER + if (outputArg.lifetime == Request::Argument::LifeTime::POOL) + { + size_t bufferSize = memPools.at(outputArg.location.poolIndex).getSize(); + if (bufferSize < outputSize) + { + VLOG(DRIVER) << "ArmnnPreparedModel::Execute failed bufferSize " + << std::to_string(outputArg.location.length).c_str() + << " < outputSize " << std::to_string(outputSize).c_str(); + outputShapes[i].isSufficient = false; + return ErrorStatus::OUTPUT_INSUFFICIENT_SIZE; + } + } + } + return ErrorStatus::NONE; +} + +ErrorStatus ArmnnPreparedModel::PrepareMemoryForIO(armnn::InputTensors& inputs, + armnn::OutputTensors& outputs, + std::vector& memPools, + const Request& request) const +{ + //Check memory pools are not empty + // add the inputs and outputs with their data + try + { + if (!setRunTimePoolInfosFromMemoryPools(&memPools, request.pools)) + { + return ErrorStatus::INVALID_ARGUMENT; + } + + if (PrepareMemoryForInputs(inputs, request, memPools) != ErrorStatus::NONE) + { + VLOG(DRIVER) << "Failed when preparing memory for Inputs"; + return ErrorStatus::GENERAL_FAILURE; + } + + std::vector outputShapes(request.outputs.size()); + + auto errorStatus = PrepareMemoryForOutputs(outputs, outputShapes, request, memPools); + if (errorStatus != ErrorStatus::NONE) + { + return errorStatus; + } + } + catch (armnn::Exception& e) + { + VLOG(DRIVER) << "armnn::Exception caught while preparing for EnqueueWorkload: " << e.what(); + return ErrorStatus::GENERAL_FAILURE; + } + catch (std::exception& e) + { + VLOG(DRIVER) << "std::exception caught while preparing for EnqueueWorkload: " << e.what(); + return ErrorStatus::GENERAL_FAILURE; + } + + return ErrorStatus::NONE; +} + +ExecutionResult, Timing>> ArmnnPreparedModel::execute( + const Request& request, + MeasureTiming measureTiming, + const OptionalTimePoint& deadline, + const OptionalDuration&, + const std::vector& hints, + const std::vector& extensionNameToPrefix) const +{ + VLOG(DRIVER) << "CanonicalDriver::PreparedModel::execute()"; + + CanonicalExecutionContext ctx; + if (measureTiming == MeasureTiming::YES) + { + ctx.measureTimings = measureTiming; + ctx.driverStart = Clock::now(); + } + + if (!m_PrepareFromCache) + { + const auto modelRequest = validateRequestForModel(request, m_Model); + if (!modelRequest.ok()) + { + return NN_ERROR(ErrorStatus::INVALID_ARGUMENT) << modelRequest.error(); + } + VLOG(DRIVER) << "ArmnnPreparedModel::execute(): " << GetModelSummary(m_Model).c_str(); + } + if (hasDeadlinePassed(deadline)) { + return NN_ERROR(ErrorStatus::MISSED_DEADLINE_PERSISTENT); + } + + // map the memory pool into shared pointers + // 
use a shared memory pools vector on the heap, as it is passed to the request thread + auto memPools = std::make_shared>(); + + // allocate the tensors on the heap, as they are passed to the request thread + auto inputTensors = std::make_shared(); + auto outputTensors = std::make_shared(); + + ErrorStatus theErrorStatus = ErrorStatus::NONE; + + auto isPointerTypeMemory = IsPointerTypeMemory(request); + nn::RequestRelocation relocation; + if (isPointerTypeMemory) + { + std::optional maybeRequestInShared; + auto executionResult = + nn::convertRequestFromPointerToShared( + &request, nn::kDefaultRequestMemoryAlignment, nn::kMinMemoryPadding, + &maybeRequestInShared, &relocation); + if(!executionResult.has_value()) + { + VLOG(DRIVER) << "ArmnnPreparedModel::PrepareMemoryForIO::Failed to convertRequestFromPointerToShared."; + return NN_ERROR(ErrorStatus::GENERAL_FAILURE) + << "ArmnnPreparedModel convertRequestFromPointerToShared failed"; + } + const nn::Request& requestInShared = std::move(executionResult).value(); + if (relocation.input) + { + relocation.input->flush(); + } + + theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, requestInShared); + } + else + { + theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, request); + } + + switch(theErrorStatus) + { + case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: + return NN_ERROR(ErrorStatus::OUTPUT_INSUFFICIENT_SIZE); + case ErrorStatus::GENERAL_FAILURE: + return NN_ERROR(ErrorStatus::GENERAL_FAILURE); + case ErrorStatus::INVALID_ARGUMENT: + return NN_ERROR(ErrorStatus::INVALID_ARGUMENT); + default: + {} + } + + std::vector outputShapes(outputTensors->size()); + for (unsigned int i = 0; i < outputTensors->size(); i++) + { + std::pair outputTensorPair = (*outputTensors)[i]; + const armnn::Tensor outputTensor = outputTensorPair.second; + const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo(); + + outputShapes[i] = ComputeShape(outputTensorInfo); + } + Timing theTiming; + + VLOG(DRIVER) << "ArmnnPreparedModel::execute(...) before ExecuteGraph"; + auto errorStatus = ExecuteGraph(memPools, *inputTensors, *outputTensors, ctx); + if (errorStatus != ErrorStatus::NONE) + { + return NN_ERROR(errorStatus) << "execute() failed"; + } + VLOG(DRIVER) << "ArmnnPreparedModel::execute(...) 
after ExecuteGraph"; + if (isPointerTypeMemory && relocation.output) + { + relocation.output->flush(); + } + + return std::make_pair(outputShapes, theTiming); +} + +ErrorStatus ArmnnPreparedModel::ExecuteGraph( + std::shared_ptr>& pMemPools, + armnn::InputTensors& inputTensors, + armnn::OutputTensors& outputTensors, + CanonicalExecutionContext ctx) const +{ + VLOG(DRIVER) << "ArmnnPreparedModel::ExecuteGraph(...)"; + + DumpTensorsIfRequired("Input", inputTensors); + + try + { + if (ctx.measureTimings == MeasureTiming::YES) + { + ctx.deviceStart = Clock::now(); + } + armnn::Status status; + VLOG(DRIVER) << "ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false"; + + status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); + + if (ctx.measureTimings == MeasureTiming::YES) + { + ctx.deviceEnd = Clock::now(); + } + if (status != armnn::Status::Success) + { + VLOG(DRIVER) << "ArmnnPreparedModel:ExecuteGraph EnqueueWorkload failed"; + return ErrorStatus::GENERAL_FAILURE; + } + } + catch (armnn::Exception& e) + { + VLOG(DRIVER) << "armnn:Exception caught from EnqueueWorkload: " << e.what(); + return ErrorStatus::GENERAL_FAILURE; + } + catch (std::exception& e) + { + VLOG(DRIVER) << "std::exception caught from EnqueueWorkload: " << e.what(); + return ErrorStatus::GENERAL_FAILURE; + } + + CommitPools(*pMemPools); + DumpTensorsIfRequired("Output", outputTensors); + + if (ctx.measureTimings == MeasureTiming::YES) + { + ctx.driverEnd = Clock::now(); + Timing timing; + timing.timeOnDevice = ctx.deviceEnd - ctx.deviceStart; + timing.timeInDriver = ctx.driverEnd - ctx.driverStart; + VLOG(DRIVER) << "ArmnnPreparedModel::execute timing - Device = " + << timing.timeOnDevice << "Driver = " << timing.timeInDriver; + } + return ErrorStatus::NONE; +} + +Priority ArmnnPreparedModel::GetModelPriority() const +{ + return m_ModelPriority; +} + + +GeneralResult> ArmnnPreparedModel::executeFenced( + const Request& request, + const std::vector& waitFor, + MeasureTiming measureTiming, + const OptionalTimePoint& deadline, + const OptionalDuration&, + const OptionalDuration&, + const std::vector& hints, + const std::vector& extensionNameToPrefix) const +{ + VLOG(DRIVER) << "ArmnnPreparedModel::executeFenced()"; + + if (!m_PrepareFromCache) { + const auto modelRequest = validateRequestForModel(request, m_Model); + if (!modelRequest.ok()) + { + return NN_ERROR(ErrorStatus::INVALID_ARGUMENT) << modelRequest.error(); + } + VLOG(DRIVER) << "ArmnnPreparedModel::executeFenced(): " << GetModelSummary(m_Model).c_str(); + } + if (hasDeadlinePassed(deadline)) + { + return NN_ERROR(ErrorStatus::MISSED_DEADLINE_PERSISTENT); + } + + CanonicalExecutionContext ctx; + if (measureTiming == MeasureTiming::YES) + { + ctx.measureTimings = measureTiming; + ctx.driverStart = Clock::now(); + } + + // Wait for the dependent events to signal + for (const auto& syncFence : waitFor) + { + if (!syncFence.getSharedHandle()) + { + return NN_ERROR(ErrorStatus::INVALID_ARGUMENT); + } + if (syncFence.syncWait({}) != SyncFence::FenceState::SIGNALED) + { + return NN_ERROR(ErrorStatus::GENERAL_FAILURE) << "syncWait failed"; + } + } + + android::nn::TimePoint fenceExecutionStart; + if (measureTiming == MeasureTiming::YES) + { + fenceExecutionStart = Clock::now(); + } + + // map the memory pool into shared pointers + // use a shared memory pools vector on the heap, as it is passed to the request thread + auto memPools = std::make_shared>(); + + // allocate the tensors on the heap, as they are passed to the request thread + 
auto inputTensors = std::make_shared(); + auto outputTensors = std::make_shared(); + + ErrorStatus theErrorStatus = ErrorStatus::NONE; + + auto isPointerTypeMemory = IsPointerTypeMemory(request); + nn::RequestRelocation relocation; + if (isPointerTypeMemory) + { + std::optional maybeRequestInShared; + auto executionResult = + nn::convertRequestFromPointerToShared( + &request, nn::kDefaultRequestMemoryAlignment, nn::kMinMemoryPadding, + &maybeRequestInShared, &relocation); + if(!executionResult.has_value()) + { + VLOG(DRIVER) << "ArmnnPreparedModel::PrepareMemoryForIO::Failed to convertRequestFromPointerToShared."; + return NN_ERROR(ErrorStatus::GENERAL_FAILURE) + << "ArmnnPreparedModel convertRequestFromPointerToShared failed"; + } + const nn::Request& requestInShared = std::move(executionResult).value(); + if (relocation.input) + { + relocation.input->flush(); + } + + theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, requestInShared); + } + else + { + theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, request); + } + + if (theErrorStatus != ErrorStatus::NONE) + { + return NN_ERROR(ErrorStatus::INVALID_ARGUMENT) << "executeFenced() failed"; + } + + Timing timingSinceLaunch = {}; + Timing timingAfterFence = {}; + if (measureTiming == MeasureTiming::YES) + { + timingAfterFence.timeOnDevice = ctx.deviceEnd - ctx.deviceStart; + timingAfterFence.timeInDriver = ctx.driverEnd - fenceExecutionStart; + VLOG(DRIVER) << "executeFenced timingSinceLaunch = " << timingAfterFence.timeOnDevice; + VLOG(DRIVER) << "executeFenced timingAfterFence = " << timingAfterFence.timeInDriver; + } + + VLOG(DRIVER) << "ArmnnCanonicalPreparedModel::executeFenced(...) before ExecuteGraph"; + auto errorStatus = ExecuteGraph(memPools, *inputTensors, *outputTensors, ctx); + VLOG(DRIVER) << "ArmnnCanonicalPreparedModel::executeFenced(...) 
after ExecuteGraph"; + if (isPointerTypeMemory && relocation.output) + { + relocation.output->flush(); + } + + ExecuteFencedInfoCallback armnnFencedExecutionCallback = + [timingSinceLaunch, timingAfterFence, errorStatus]() { + + GeneralResult> result; + + switch(errorStatus) + { + case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: + result.error().code = (ErrorStatus::OUTPUT_INSUFFICIENT_SIZE); + case ErrorStatus::GENERAL_FAILURE: + result.error().code = (ErrorStatus::GENERAL_FAILURE); + case ErrorStatus::INVALID_ARGUMENT: + result.error().code = (ErrorStatus::INVALID_ARGUMENT); + default: + { + result.value() = std::make_pair(timingSinceLaunch, timingAfterFence); + } + } + return result; + }; + return std::make_pair(SyncFence::createAsSignaled(), std::move(armnnFencedExecutionCallback )); +} + +GeneralResult ArmnnPreparedModel::createReusableExecution( + const Request& request, + MeasureTiming measureTiming, + const OptionalDuration& loopTimeoutDuration, + const std::vector& hints, + const std::vector& extensionNameToPrefix) const +{ + VLOG(DRIVER) << "ArmnnPreparedModel::createReusableExecution()"; + return std::make_shared(shared_from_this(), + request, + measureTiming, + loopTimeoutDuration); +} + +GeneralResult ArmnnPreparedModel::configureExecutionBurst() const +{ + // TODO: Implement BURST + return nullptr; +} + +std::any ArmnnPreparedModel::getUnderlyingResource() const +{ + return &m_Model; +} + +template +void ArmnnPreparedModel::DumpTensorsIfRequired(char const* tensorNamePrefix, + const TensorBindingCollection& tensorBindings) const +{ + if (!m_RequestInputsAndOutputsDumpDir.empty()) + { + const std::string requestName = std::to_string(m_NetworkId) + ".dump"; + for (std::size_t i = 0u; i < tensorBindings.size(); ++i) + { + DumpTensor(m_RequestInputsAndOutputsDumpDir, + requestName, + BuildTensorName(tensorNamePrefix, i), + tensorBindings[i].second); + } + } +} + +ArmnnPreparedModel::~ArmnnPreparedModel() +{ + VLOG(DRIVER) << "ArmnnPreparedModel::~ArmnnPreparedModel()"; + // Get a hold of the profiler used by this model. + if (m_GpuProfilingEnabled) + { + auto profiler = m_Runtime->GetProfiler(m_NetworkId); + if (profiler) + { + // Dump the profiling info to a file if required. + DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, + m_RequestInputsAndOutputsDumpDir, + m_NetworkId, + profiler.get()); + } + } + // Unload the network associated with this model + m_Runtime->UnloadNetwork(m_NetworkId); +} + +bool ArmnnPreparedModel::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs) const +{ + std::vector> storage; + armnn::InputTensors inputTensors; + for (unsigned int i = 0; i < numInputs; i++) + { + armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors. + // Therefore, set all TensorInfo isConstant parameters of input Tensors to true. 
+ inputTensorInfo.SetConstant();
+ storage.emplace_back(inputTensorInfo.GetNumBytes());
+ const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());
+
+ inputTensors.emplace_back(i, inputTensor);
+ }
+
+ armnn::OutputTensors outputTensors;
+ for (unsigned int i = 0; i < numOutputs; i++)
+ {
+ const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
+ storage.emplace_back(outputTensorInfo.GetNumBytes());
+ const armnn::Tensor outputTensor(outputTensorInfo, storage.back().data());
+
+ outputTensors.emplace_back(i, outputTensor);
+ }
+ CanonicalExecutionContext ctx;
+ ctx.measureTimings = MeasureTiming::NO;
+ auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();
+
+ auto errorStatus = ExecuteGraph(memPools,
+ inputTensors,
+ outputTensors,
+ ctx);
+
+ return errorStatus == ErrorStatus::NONE;
+}
+
+} // namespace armnn_driver
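
For reference, below is a stand-alone sketch of the MicrosecondsDuration timing helper defined near the top of the file, with its std::chrono template arguments written out. It is not part of the patch: std::chrono::steady_clock::time_point stands in for android::nn::TimePoint, and the cast/duration types are assumptions inferred from the helper's name and its unsigned long return type.

// Hypothetical, self-contained sketch of the duration helper used by the
// driver's timing code; types are assumed, not taken verbatim from the patch.
#include <chrono>
#include <cstdio>

using TimePoint = std::chrono::steady_clock::time_point;   // stand-in for android::nn::TimePoint

unsigned long MicrosecondsDuration(TimePoint endPoint, TimePoint startPoint)
{
    // Convert the elapsed wall-clock interval to whole microseconds.
    return static_cast<unsigned long>(
        std::chrono::duration_cast<std::chrono::microseconds>(endPoint - startPoint).count());
}

int main()
{
    const TimePoint start = std::chrono::steady_clock::now();
    const TimePoint end   = start + std::chrono::milliseconds(3);
    std::printf("elapsed: %lu us\n", MicrosecondsDuration(end, start));   // prints 3000
    return 0;
}

When MeasureTiming::YES is requested, execute() records ctx.driverStart/ctx.driverEnd and ExecuteGraph() records ctx.deviceStart/ctx.deviceEnd with Clock::now(); the reported timeInDriver and timeOnDevice values are differences of such time points, which is the measurement this helper expresses.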