From 72b415674e7df9152a1e5d206b70e4d7a663f3fc Mon Sep 17 00:00:00 2001
From: alered01
Date: Thu, 7 May 2020 14:58:29 +0100
Subject: Adding more performance metrics

* Implemented CLTuning flow for ExecuteNetwork tests
* Added --tuning-path to specify tuning file to use/create
* Added --tuning-level to specify tuning level to use as well as enable extra tuning run to generate the tuning file
* Fixed issue where TuningLevel was being parsed incorrectly
* Added measurements for initialization, network parsing, network optimization, tuning, and shutdown
* Added flag to control number of iterations inference is run for

Signed-off-by: alered01
Change-Id: Ic739ff26e136e32aff9f0995217c1c3207008ca4
---
 include/armnn/utility/Timer.hpp          |  25 ++++
 src/armnn/Runtime.cpp                    |   8 ++
 src/backends/cl/ClBackendContext.cpp     |  10 +-
 tests/ExecuteNetwork/ExecuteNetwork.cpp  |  47 ++++++-
 tests/InferenceModel.hpp                 |  31 ++---
 .../NetworkExecutionUtils.hpp            | 154 ++++++++++++++-------
 6 files changed, 202 insertions(+), 73 deletions(-)
 create mode 100644 include/armnn/utility/Timer.hpp

diff --git a/include/armnn/utility/Timer.hpp b/include/armnn/utility/Timer.hpp
new file mode 100644
index 0000000000..daf689e74f
--- /dev/null
+++ b/include/armnn/utility/Timer.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <chrono>
+#include
+
+namespace armnn
+{
+
+inline std::chrono::high_resolution_clock::time_point GetTimeNow()
+{
+    return std::chrono::high_resolution_clock::now();
+}
+
+inline std::chrono::duration<double, std::milli> GetTimeDuration(
+    std::chrono::high_resolution_clock::time_point start_time)
+{
+    return std::chrono::duration<double, std::milli>(GetTimeNow() - start_time);
+}
+
+}
\ No newline at end of file
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index dbdd409784..b1b7d51d7c 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include <armnn/utility/Timer.hpp>
 #include
 #include
 
@@ -171,6 +172,7 @@ Runtime::Runtime(const CreationOptions& options)
     : m_NetworkIdCounter(0),
       m_ProfilingService(*this)
 {
+    const auto start_time = armnn::GetTimeNow();
     ARMNN_LOG(info) << "ArmNN v" << ARMNN_VERSION << "\n";
 
     if ( options.m_ProfilingOptions.m_TimelineEnabled && !options.m_ProfilingOptions.m_EnableProfiling )
@@ -225,10 +227,14 @@ Runtime::Runtime(const CreationOptions& options)
     m_ProfilingService.ConfigureProfilingService(options.m_ProfilingOptions);
 
     m_DeviceSpec.AddSupportedBackends(supportedBackends);
+
+    ARMNN_LOG(info) << "Initialization time: " << std::setprecision(2)
+                    << std::fixed << armnn::GetTimeDuration(start_time).count() << " ms\n";
 }
 
 Runtime::~Runtime()
 {
+    const auto start_time = armnn::GetTimeNow();
     std::vector<NetworkId> networkIDs;
     try
     {
@@ -272,6 +278,8 @@ Runtime::~Runtime()
     m_BackendContexts.clear();
 
     BackendRegistryInstance().SetProfilingService(armnn::EmptyOptional());
+    ARMNN_LOG(info) << "Shutdown time: " << std::setprecision(2)
+                    << std::fixed << armnn::GetTimeDuration(start_time).count() << " ms\n";
 }
 
 LoadedNetwork* Runtime::GetLoadedNetworkPtr(NetworkId networkId) const
diff --git a/src/backends/cl/ClBackendContext.cpp b/src/backends/cl/ClBackendContext.cpp
index bfe93bdc01..42f42b3023 100644
--- a/src/backends/cl/ClBackendContext.cpp
+++ b/src/backends/cl/ClBackendContext.cpp
@@ -79,7 +79,7 @@ TuningLevel ParseTuningLevel(const BackendOptions::Var& value, TuningLevel defau
 {
     if (value.IsInt())
     {
-        int v = value.IsInt();
+        int v = value.AsInt();
         if (v > static_cast<int>(TuningLevel::Exhaustive) || v <
static_cast(TuningLevel::None)) { @@ -218,18 +218,18 @@ ClBackendContext::ClBackendContext(const IRuntime::CreationOptions& options) ConfigureTuner(*(m_Tuner.get()), tuningLevel); - if (!m_TuningFile.empty()) + if (!m_TuningFile.empty() && tuningLevel == TuningLevel::None) { try { m_Tuner->load_from_file(m_TuningFile.c_str()); - } catch (const std::exception& e) + } + catch (const std::exception& e) { ARMNN_LOG(warning) << "Could not load GpuAcc tuner data file."; } - - tuner = m_Tuner.get(); } + tuner = m_Tuner.get(); } m_ClContextControlWrapper = std::make_unique( diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp index 57b8692701..66d8e1378f 100644 --- a/tests/ExecuteNetwork/ExecuteNetwork.cpp +++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp @@ -35,6 +35,10 @@ int main(int argc, const char* argv[]) uint32_t counterCapturePeriod; std::string fileFormat; + size_t iterations = 1; + int tuningLevel = 0; + std::string tuningPath; + double thresholdTime = 0.0; size_t subgraphId = 0; @@ -121,6 +125,14 @@ int main(int argc, const char* argv[]) "If profiling is enabled in 'file-only' mode this is the capture period that will be used in the test") ("file-format", po::value(&fileFormat)->default_value("binary"), "If profiling is enabled specifies the output file format") + ("iterations", po::value(&iterations)->default_value(1), + "Number of iterations to run the network for, default is set to 1") + ("tuning-path", po::value(&tuningPath), + "Path to tuning file. Enables use of CL tuning") + ("tuning-level", po::value(&tuningLevel)->default_value(0), + "Sets the tuning level which enables a tuning run which will update/create a tuning file. " + "Available options are: 1 (Rapid), 2 (Normal), 3 (Exhaustive). " + "Requires tuning-path to be set, default is set to 0 (No tuning run)") ("parse-unsupported", po::bool_switch()->default_value(false), "Add unsupported operators as stand-in layers (where supported by parser)"); } @@ -275,6 +287,33 @@ int main(int argc, const char* argv[]) // Remove duplicates from the list of compute devices. 
RemoveDuplicateDevices(computeDevices); +#if defined(ARMCOMPUTECL_ENABLED) + std::shared_ptr tuned_params; + + if (tuningPath != "") + { + if (tuningLevel != 0) + { + RunCLTuning(tuningPath, tuningLevel, modelFormat, inputTensorShapes, computeDevices, + dynamicBackendsPath, modelPath, inputNames, inputTensorDataFilePaths, inputTypes, quantizeInput, + outputTypes, outputNames, outputTensorFiles, dequantizeOutput, enableProfiling, + enableFp16TurboMode, enableBf16TurboMode, thresholdTime, printIntermediate, subgraphId, + enableLayerDetails, parseUnsupported); + } + ARMNN_LOG(info) << "Using tuning params: " << tuningPath << "\n"; + options.m_BackendOptions.emplace_back( + armnn::BackendOptions + { + "GpuAcc", + { + {"TuningLevel", 0}, + {"TuningFile", tuningPath.c_str()}, + {"KernelProfilingEnabled", enableProfiling} + } + } + ); + } +#endif try { CheckOptionDependencies(vm); @@ -288,9 +327,9 @@ int main(int argc, const char* argv[]) // Create runtime std::shared_ptr runtime(armnn::IRuntime::Create(options)); - return RunTest(modelFormat, inputTensorShapes, computeDevices, dynamicBackendsPath, modelPath, inputNames, - inputTensorDataFilePaths, inputTypes, quantizeInput, outputTypes, outputNames, - outputTensorFiles, dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode, - thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnsupported, runtime); + return RunTest(modelFormat, inputTensorShapes, computeDevices, dynamicBackendsPath, modelPath, + inputNames, inputTensorDataFilePaths, inputTypes, quantizeInput, outputTypes, outputNames, + outputTensorFiles, dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode, + thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnsupported, iterations, runtime); } } diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp index 410bc7c04e..781cef4ed0 100644 --- a/tests/InferenceModel.hpp +++ b/tests/InferenceModel.hpp @@ -6,6 +6,8 @@ #pragma once #include +#include +#include #include #include @@ -31,7 +33,6 @@ #include #include -#include #include #include #include @@ -399,8 +400,12 @@ public: throw armnn::Exception("Some backend IDs are invalid: " + invalidBackends); } + const auto parsing_start_time = armnn::GetTimeNow(); armnn::INetworkPtr network = CreateNetworkImpl::Create(params, m_InputBindings, m_OutputBindings); + ARMNN_LOG(info) << "Network parsing time: " << std::setprecision(2) + << std::fixed << armnn::GetTimeDuration(parsing_start_time).count() << " ms\n"; + armnn::IOptimizedNetworkPtr optNet{nullptr, [](armnn::IOptimizedNetwork*){}}; { ARMNN_SCOPED_HEAP_PROFILING("Optimizing"); @@ -410,7 +415,12 @@ public: options.m_ReduceFp32ToBf16 = params.m_EnableBf16TurboMode; options.m_Debug = params.m_PrintIntermediateLayers; + const auto optimization_start_time = armnn::GetTimeNow(); optNet = armnn::Optimize(*network, params.m_ComputeDevices, m_Runtime->GetDeviceSpec(), options); + + ARMNN_LOG(info) << "Optimization time: " << std::setprecision(2) + << std::fixed << armnn::GetTimeDuration(optimization_start_time).count() << " ms\n"; + if (!optNet) { throw armnn::Exception("Optimize returned nullptr"); @@ -494,13 +504,13 @@ public: } // Start timer to record inference time in EnqueueWorkload (in milliseconds) - const auto start_time = GetCurrentTime(); + const auto start_time = armnn::GetTimeNow(); armnn::Status ret = m_Runtime->EnqueueWorkload(m_NetworkIdentifier, MakeInputTensors(inputContainers), MakeOutputTensors(outputContainers)); - const auto end_time = 
GetCurrentTime(); + const auto duration = armnn::GetTimeDuration(start_time); // if profiling is enabled print out the results if (profiler && profiler->IsProfilingEnabled()) @@ -514,7 +524,7 @@ public: } else { - return std::chrono::duration(end_time - start_time); + return duration; } } @@ -584,17 +594,4 @@ private: { return armnnUtils::MakeOutputTensors(m_OutputBindings, outputDataContainers); } - - std::chrono::high_resolution_clock::time_point GetCurrentTime() - { - return std::chrono::high_resolution_clock::now(); - } - - std::chrono::duration GetTimeDuration( - std::chrono::high_resolution_clock::time_point& start_time, - std::chrono::high_resolution_clock::time_point& end_time) - { - return std::chrono::duration(end_time - start_time); - } - }; diff --git a/tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp b/tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp index a922228689..ec0eaf90f8 100644 --- a/tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp +++ b/tests/NetworkExecutionUtils/NetworkExecutionUtils.hpp @@ -4,6 +4,7 @@ // #include #include +#include #if defined(ARMNN_SERIALIZER) #include "armnnDeserializer/IDeserializer.hpp" @@ -378,7 +379,8 @@ struct ExecuteNetworkParams template int MainImpl(const ExecuteNetworkParams& params, - const std::shared_ptr& runtime = nullptr) + const std::shared_ptr& runtime = nullptr, + size_t iterations = 1) { using TContainer = boost::variant, std::vector, std::vector>; @@ -473,44 +475,47 @@ int MainImpl(const ExecuteNetworkParams& params, } } - // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds) - auto inference_duration = model.Run(inputDataContainers, outputDataContainers); - - if (params.m_GenerateTensorData) + for (size_t x = 0; x < iterations; x++) { - ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful"; - } + // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds) + auto inference_duration = model.Run(inputDataContainers, outputDataContainers); - // Print output tensors - const auto& infosOut = model.GetOutputBindingInfos(); - for (size_t i = 0; i < numOutputs; i++) - { - const armnn::TensorInfo& infoOut = infosOut[i].second; - auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i]; - - TensorPrinter printer(inferenceModelParams.m_OutputBindings[i], - infoOut, - outputTensorFile, - params.m_DequantizeOutput); - boost::apply_visitor(printer, outputDataContainers[i]); - } + if (params.m_GenerateTensorData) + { + ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful"; + } - ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) - << std::fixed << inference_duration.count() << " ms"; + // Print output tensors + const auto& infosOut = model.GetOutputBindingInfos(); + for (size_t i = 0; i < numOutputs; i++) + { + const armnn::TensorInfo& infoOut = infosOut[i].second; + auto outputTensorFile = params.m_OutputTensorFiles.empty() ? 
"" : params.m_OutputTensorFiles[i]; + + TensorPrinter printer(inferenceModelParams.m_OutputBindings[i], + infoOut, + outputTensorFile, + params.m_DequantizeOutput); + boost::apply_visitor(printer, outputDataContainers[i]); + } - // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line - if (params.m_ThresholdTime != 0.0) - { - ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) - << std::fixed << params.m_ThresholdTime << " ms"; - auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count(); - ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) - << std::fixed << thresholdMinusInference << " ms" << "\n"; + ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) + << std::fixed << inference_duration.count() << " ms\n"; - if (thresholdMinusInference < 0) + // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line + if (params.m_ThresholdTime != 0.0) { - std::string errorMessage = "Elapsed inference time is greater than provided threshold time."; - ARMNN_LOG(fatal) << errorMessage; + ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) + << std::fixed << params.m_ThresholdTime << " ms"; + auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count(); + ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) + << std::fixed << thresholdMinusInference << " ms" << "\n"; + + if (thresholdMinusInference < 0) + { + std::string errorMessage = "Elapsed inference time is greater than provided threshold time."; + ARMNN_LOG(fatal) << errorMessage; + } } } } @@ -545,6 +550,7 @@ int RunTest(const std::string& format, const size_t subgraphId, bool enableLayerDetails = false, bool parseUnsupported = false, + const size_t iterations = 1, const std::shared_ptr& runtime = nullptr) { std::string modelFormat = armnn::stringUtils::StringTrimCopy(format); @@ -682,34 +688,34 @@ int RunTest(const std::string& format, if (modelFormat.find("armnn") != std::string::npos) { #if defined(ARMNN_SERIALIZER) - return MainImpl(params, runtime); + return MainImpl(params, runtime, iterations); #else ARMNN_LOG(fatal) << "Not built with serialization support."; - return EXIT_FAILURE; + return EXIT_FAILURE; #endif } else if (modelFormat.find("caffe") != std::string::npos) { #if defined(ARMNN_CAFFE_PARSER) - return MainImpl(params, runtime); + return MainImpl(params, runtime, iterations); #else ARMNN_LOG(fatal) << "Not built with Caffe parser support."; return EXIT_FAILURE; #endif } else if (modelFormat.find("onnx") != std::string::npos) -{ + { #if defined(ARMNN_ONNX_PARSER) - return MainImpl(params, runtime); + return MainImpl(params, runtime, iterations); #else ARMNN_LOG(fatal) << "Not built with Onnx parser support."; - return EXIT_FAILURE; + return EXIT_FAILURE; #endif } else if (modelFormat.find("tensorflow") != std::string::npos) { #if defined(ARMNN_TF_PARSER) - return MainImpl(params, runtime); + return MainImpl(params, runtime, iterations); #else ARMNN_LOG(fatal) << "Not built with Tensorflow parser support."; return EXIT_FAILURE; @@ -720,21 +726,21 @@ int RunTest(const std::string& format, #if defined(ARMNN_TF_LITE_PARSER) if (! isModelBinary) { - ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat << "'. Only 'binary' format supported \ - for tflite files"; + ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat + << "'. 
Only 'binary' format supported for tflite files"; return EXIT_FAILURE; } - return MainImpl(params, runtime); + return MainImpl(params, runtime, iterations); #else - ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat << - "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'"; + ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat + << "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'"; return EXIT_FAILURE; #endif } else { - ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat << - "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'"; + ARMNN_LOG(fatal) << "Unknown model format: '" << modelFormat + << "'. Please include 'caffe', 'tensorflow', 'tflite' or 'onnx'"; return EXIT_FAILURE; } } @@ -864,3 +870,57 @@ int RunCsvTest(const armnnUtils::CsvRow &csvRow, const std::shared_ptr& computeDevices, + const std::string& dynamicBackendsPath, + const std::string& modelPath, + const std::string& inputNames, + const std::string& inputTensorDataFilePaths, + const std::string& inputTypes, + bool quantizeInput, + const std::string& outputTypes, + const std::string& outputNames, + const std::string& outputTensorFiles, + bool dequantizeOutput, + bool enableProfiling, + bool enableFp16TurboMode, + bool enableBf16TurboMode, + const double& thresholdTime, + bool printIntermediate, + const size_t subgraphId, + bool enableLayerDetails = false, + bool parseUnsupported = false) +{ + armnn::IRuntime::CreationOptions options; + options.m_BackendOptions.emplace_back( + armnn::BackendOptions + { + "GpuAcc", + { + {"TuningLevel", tuningLevel}, + {"TuningFile", tuningPath.c_str()}, + {"KernelProfilingEnabled", enableProfiling} + } + } + ); + + std::shared_ptr runtime(armnn::IRuntime::Create(options)); + const auto start_time = armnn::GetTimeNow(); + + ARMNN_LOG(info) << "Tuning run...\n"; + int state = RunTest(modelFormat, inputTensorShapes, computeDevices, dynamicBackendsPath, modelPath, inputNames, + inputTensorDataFilePaths, inputTypes, quantizeInput, outputTypes, outputNames, + outputTensorFiles, dequantizeOutput, enableProfiling, enableFp16TurboMode, enableBf16TurboMode, + thresholdTime, printIntermediate, subgraphId, enableLayerDetails, parseUnsupported, 1, runtime); + + ARMNN_LOG(info) << "Tuning time: " << std::setprecision(2) + << std::fixed << armnn::GetTimeDuration(start_time).count() << " ms\n"; + + return state; +} +#endif \ No newline at end of file -- cgit v1.2.1
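
For readers wanting to reuse the new timing helpers outside of Runtime.cpp and InferenceModel.hpp, the following is a minimal sketch (not part of the patch) of the intended usage of armnn::GetTimeNow() and armnn::GetTimeDuration(). The sleep is a placeholder for whatever is being measured; the output formatting mirrors the "Initialization time" / "Shutdown time" logging added above, and the helpers report wall-clock milliseconds because GetTimeDuration() returns std::chrono::duration<double, std::milli>.

    // Sketch only: times a placeholder workload with the helpers added in
    // include/armnn/utility/Timer.hpp by this patch.
    #include <armnn/utility/Timer.hpp>

    #include <chrono>
    #include <iomanip>
    #include <iostream>
    #include <thread>

    int main()
    {
        const auto start_time = armnn::GetTimeNow();

        // Placeholder workload; in the patch the same pattern brackets runtime
        // construction, network parsing, optimization, tuning and shutdown.
        std::this_thread::sleep_for(std::chrono::milliseconds(5));

        // count() is the elapsed time in milliseconds (double, std::milli).
        std::cout << "Elapsed: " << std::setprecision(2) << std::fixed
                  << armnn::GetTimeDuration(start_time).count() << " ms\n";
    }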
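
The --tuning-path/--tuning-level flow reduces to the GpuAcc BackendOptions shown in the ExecuteNetwork.cpp and RunCLTuning hunks: a non-zero TuningLevel performs a tuning run that creates or updates the tuning file, while TuningLevel 0 only loads an existing file. The sketch below illustrates that two-phase pattern for a standalone client; it is not part of the patch, and the helper name MakeTuningRuntime and the umbrella include are illustrative assumptions. Loading and running a network on the returned runtime is elided.

    // Sketch (assumptions flagged inline): drive the same two-phase CL tuning
    // flow that ExecuteNetwork now implements via --tuning-path/--tuning-level.
    #include <armnn/ArmNN.hpp>   // assumed umbrella header for IRuntime/BackendOptions

    #include <string>

    armnn::IRuntimePtr MakeTuningRuntime(const std::string& tuningPath,
                                         int tuningLevel,
                                         bool kernelProfiling)
    {
        armnn::IRuntime::CreationOptions options;
        // Same GpuAcc options the patch adds in ExecuteNetwork.cpp / RunCLTuning.
        options.m_BackendOptions.emplace_back(
            armnn::BackendOptions
            {
                "GpuAcc",
                {
                    {"TuningLevel", tuningLevel},
                    {"TuningFile", tuningPath.c_str()},
                    {"KernelProfilingEnabled", kernelProfiling}
                }
            });
        return armnn::IRuntime::Create(options);
    }

    // Usage: a tuning pass first, then the measured pass reusing the generated file.
    //   auto tuningRuntime   = MakeTuningRuntime("tuned_params.csv", 2, false); // Normal tuning run
    //   auto measuredRuntime = MakeTuningRuntime("tuned_params.csv", 0, false); // load-only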