From c577f2c6a3b4ddb6ba87a882723c53a248afbeba Mon Sep 17 00:00:00 2001
From: telsoa01 <telmo.soares@arm.com>
Date: Fri, 31 Aug 2018 09:22:23 +0100
Subject: Release 18.08

---
 src/armnn/Profiling.cpp | 455 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 346 insertions(+), 109 deletions(-)

(limited to 'src/armnn/Profiling.cpp')
diff --git a/src/armnn/Profiling.cpp b/src/armnn/Profiling.cpp
index 15a195e6bd..f70f6a34d1 100644
--- a/src/armnn/Profiling.cpp
+++ b/src/armnn/Profiling.cpp
@@ -3,8 +3,7 @@
 // See LICENSE file in the project root for full license information.
 //
 #include "Profiling.hpp"
-
-#if ARMNN_PROFILING_ENABLED
+#include "JsonPrinter.hpp"
 
 #if ARMNN_STREAMLINE_ENABLED
 #include <streamline_annotate.h>
@@ -17,10 +16,12 @@
 #include <algorithm>
 #include <iomanip>
 #include <iostream>
+#include <fstream>
 #include <map>
 #include <stack>
-#include <boost/algorithm/string.hpp>
 
+#include <boost/algorithm/string.hpp>
+#include <boost/core/ignore_unused.hpp>
 namespace armnn
 {
 
@@ -32,86 +33,128 @@ constexpr std::size_t g_ProfilingEventCountHint = 1024;
 // Whether profiling reports should include the sequence of events together with their timings.
 constexpr bool g_WriteProfilingEventSequence = true;
 
-// Whether profiling reports should also report detailed information on events grouped by tag.
-// This is used to group stats per inference (see usage of ARMNN_UPDATE_PROFILING_EVENT_TAG in
-// Runtime::EnqueueWorkload). This can spam the output stream, so use carefully (or adapt
-// the code to just output information for a tag of interest).
-constexpr bool g_AggregateProfilingEventsByTag = false;
+// Whether profiling reports should also report detailed information on events grouped by inference.
+// This can spam the output stream, so use carefully (or adapt the code to just output information
+// of interest).
+constexpr bool g_AggregateProfilingEventsByInference = true;
 
-// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler
-// singleton is destroyed. It can be convenient for local tests.
-constexpr bool g_WriteReportToStdOutOnProfilerDestruction = true;
+// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed.
+// It can be convenient for local tests.
+constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false;
 
 // Whether events denoting operations running on the GPU should force a sync before/after the event.
 // This is hardcoded to true for now as the profiling timings are not very useful without it.
+#if ARMCOMPUTECL_ENABLED
 constexpr bool g_ProfilingForceGpuSync = true;
+#endif
+
+Measurement FindMeasurement(const std::string& name, const Event* event)
+{
+
+    BOOST_ASSERT(event != nullptr);
+
+    // Search though the measurements.
+    for (const auto& measurement : event->GetMeasurements())
+    {
+        if (measurement.m_Name == name)
+        {
+            // Measurement found.
+            return measurement;
+        }
+    }
+
+    // Measurement not found.
+    return Measurement{ "", 0.f, Measurement::Unit::TIME_MS };
+}
+
+std::vector<Measurement> FindKernelMeasurements(const Event* event)
+{
+    BOOST_ASSERT(event != nullptr);
+
+    std::vector<Measurement> measurements;
+
+    // Search through the measurements.
+    for (const auto& measurement : event->GetMeasurements())
+    {
+        if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0
+            || measurement.m_Name.rfind("NeonKernelTimer", 0) == 0)
+        {
+            // Measurement found.
+            measurements.push_back(measurement);
+        }
+    }
+
+    return measurements;
+}
 
 std::map<std::string, Profiler::ProfilingEventStats> Profiler::CalculateProfilingEventStats() const
 {
     std::map<std::string, ProfilingEventStats> nameToStatsMap;
 
-    for (auto&& event : m_EventSequence)
+    for (const auto& event : m_EventSequence)
     {
-        auto mapIter = nameToStatsMap.find(event.m_Label);
-        if (mapIter != nameToStatsMap.end())
+        Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get());
+
+        double durationMs = measurement.m_Value;
+        auto it = nameToStatsMap.find(event->GetName());
+        if (it != nameToStatsMap.end())
         {
-            ProfilingEventStats& stats = mapIter->second;
-            stats.m_TotalMs += event.DurationMs();
-            stats.m_MinMs = std::min(stats.m_MinMs, event.DurationMs());
-            stats.m_MaxMs = std::max(stats.m_MaxMs, event.DurationMs());
+            ProfilingEventStats& stats = it->second;
+            stats.m_TotalMs += durationMs;
+            stats.m_MinMs = std::min(stats.m_MinMs, durationMs);
+            stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs);
             ++stats.m_Count;
         }
         else
         {
-            ProfilingEventStats stats;
-            stats.m_TotalMs = event.DurationMs();
-            stats.m_MinMs = event.DurationMs();
-            stats.m_MaxMs = event.DurationMs();
-            stats.m_Count = 1;
-
-            nameToStatsMap[event.m_Label] = stats;
+            nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 });
         }
     }
 
     return nameToStatsMap;
 }
 
-void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>::const_iterator first,
-                                                   std::vector<ProfilingEvent>::const_iterator last,
-                                                   std::ostream& outStream) const
+const Event* GetEventPtr(const Event* ptr) { return ptr;}
+const Event* GetEventPtr(const std::unique_ptr<Event>& ptr) {return ptr.get(); }
+
+template<typename ItertType>
+void Profiler::AnalyzeEventSequenceAndWriteResults(ItertType first, ItertType last, std::ostream& outStream) const
 {
-    // Output event sequence, if needed
+    // Outputs event sequence, if needed.
     if (g_WriteProfilingEventSequence)
     {
-        // Make sure timestamps are output with 6 decimals, and save old settings
+        // Makes sure timestamps are output with 6 decimals, and save old settings.
         std::streamsize oldPrecision = outStream.precision();
         outStream.precision(6);
         std::ios_base::fmtflags oldFlags = outStream.flags();
         outStream.setf(std::ios::fixed);
-        // Output fields
+        // Outputs fields.
         outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl;
         for (auto event = first; event != last; ++event)
         {
-            std::chrono::duration<double, std::milli> startTimeMs = event->m_StartTime.time_since_epoch();
-            std::chrono::duration<double, std::milli> stopTimeMs = event->m_StopTime.time_since_epoch();
-
-            outStream << std::setw(50) << event->m_Label << " "
-                << std::setw(20) << event->DurationMs()
-                << std::setw(20) << startTimeMs.count()
-                << std::setw(20) << stopTimeMs.count()
-                << std::setw(20) << Profiler::Get().GetEventComputeDevice(event->m_Device)
-                << std::endl;
+            const Event* eventPtr = GetEventPtr((*event));
+            double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value;
+            double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value;
+
+            // Find the WallClock measurement if there is one.
+            double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value;
+            outStream << std::setw(50) << eventPtr->GetName() << " "
+                      << std::setw(20) << durationMs
+                      << std::setw(20) << startTimeMs
+                      << std::setw(20) << stopTimeMs
+                      << std::setw(20) << GetComputeDeviceAsCString(eventPtr->GetComputeDevice())
+                      << std::endl;
         }
         outStream << std::endl;
-        // Restore previous precision settings
+        // Restores previous precision settings.
         outStream.flags(oldFlags);
         outStream.precision(oldPrecision);
     }
 
-    // Aggregate results per event name
+    // Aggregates results per event name.
     std::map<std::string, ProfilingEventStats> nameToStatsMap = CalculateProfilingEventStats();
 
-    // Output aggregated stats
+    // Outputs aggregated stats.
     outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl;
     for (const auto& pair : nameToStatsMap)
     {
@@ -126,74 +169,236 @@ void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>::
     outStream << std::endl;
 }
 
-Profiler Profiler::s_Instance;
-
 Profiler::Profiler()
-    : m_EventTag(0)
-    , m_NestingLevel(0)
-    , m_EventTagUpdated(false)
+    : m_ProfilingEnabled(false)
 {
     m_EventSequence.reserve(g_ProfilingEventCountHint);
 
 #if ARMNN_STREAMLINE_ENABLED
-    // Initialise streamline annotations
+    // Initialises streamline annotations.
     ANNOTATE_SETUP;
 #endif
 }
 
 Profiler::~Profiler()
 {
-    if (g_WriteReportToStdOutOnProfilerDestruction)
+    if (m_ProfilingEnabled)
     {
-        AnalyzeEventsAndWriteResults(std::cout);
+        if (g_WriteReportToStdOutOnProfilerDestruction)
+        {
+            Print(std::cout);
+        }
     }
+
+    // Un-register this profiler from the current thread.
+    ProfilerManager::GetInstance().RegisterProfiler(nullptr);
 }
 
-void Profiler::BeginEvent(Compute compute, const std::string label)
+bool Profiler::IsProfilingEnabled()
+{
+    return m_ProfilingEnabled;
+}
+
+void Profiler::EnableProfiling(bool enableProfiling)
+{
+    m_ProfilingEnabled = enableProfiling;
+}
+
+Event* Profiler::BeginEvent(Compute compute, const std::string& label, std::vector<InstrumentPtr>&& instruments)
 {
     // We need to sync just before the begin event to not include time before the period we want to time.
     WaitForDevice(compute);
 
-    const TimePoint timeStamp = Clock::now();
-    m_ObservedMarkers.emplace(Marker{m_EventSequence.size(), label, timeStamp, compute, m_EventTag});
-    m_EventSequence.emplace_back();
+    Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
+    m_EventSequence.push_back(std::make_unique<Event>(label, this, parent, compute, std::move(instruments)));
+    Event* event = m_EventSequence.back().get();
+    event->Start();
 
 #if ARMNN_STREAMLINE_ENABLED
-    ANNOTATE_CHANNEL_COLOR(m_NestingLevel, GetEventColor(compute), label.c_str());
+    ANNOTATE_CHANNEL_COLOR(m_Parents.size(), GetEventColor(compute), label.c_str());
 #endif
 
-    m_NestingLevel++;
+    m_Parents.push(event);
+    return event;
 }
 
-void Profiler::EndEvent(Compute compute)
+void Profiler::EndEvent(Event* event)
 {
-    // We need to sync just before the end event to include all the time of the timed period.
-    WaitForDevice(compute);
-
-    const Marker& marker = m_ObservedMarkers.top();
+    event->Stop();
 
-    const TimePoint startTime = marker.m_TimeStamp;
-    const TimePoint stopTime = Clock::now();
+    BOOST_ASSERT(!m_Parents.empty());
+    BOOST_ASSERT(event == m_Parents.top());
+    m_Parents.pop();
 
-    m_EventSequence[marker.m_Id] = {std::move(marker.m_EventName),
-                                    startTime,
-                                    stopTime,
-                                    marker.m_ComputeDevice,
-                                    marker.m_Tag};
-
-    m_ObservedMarkers.pop();
+    Event* parent = m_Parents.empty() ? nullptr : m_Parents.top();
+    boost::ignore_unused(parent);
+    BOOST_ASSERT(event->GetParentEvent() == parent);
 
 #if ARMNN_STREAMLINE_ENABLED
-    ANNOTATE_CHANNEL_END(m_NestingLevel);
+    ANNOTATE_CHANNEL_END(m_Parents.size());
 #endif
+}
+
+int CalcLevel(const Event* eventPtr)
+{
+    int level=0;
+    while (eventPtr != nullptr)
+    {
+        eventPtr = eventPtr->GetParentEvent();
+        level++;
+    }
+    return level;
+}
+
+void Profiler::PopulateInferences(std::vector<const Event*>& outInferences, int& outBaseLevel) const
+{
+    outInferences.reserve(m_EventSequence.size());
+    for (const auto& event : m_EventSequence)
+    {
+        const Event* eventPtrRaw = event.get();
+        if (eventPtrRaw->GetName() == "EnqueueWorkload")
+        {
+            outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel;
+            outInferences.push_back(eventPtrRaw);
+        }
+    }
+}
+
+void Profiler::PopulateDescendants(std::map<const Event*, std::vector<const Event*>>& outDescendantsMap) const
+{
+    for (const auto& event : m_EventSequence)
+    {
+        const Event* eventPtrRaw = event.get();
+        const Event* parent = eventPtrRaw->GetParentEvent();
+
+        if (!parent)
+        {
+            continue;
+        }
+
+        auto it = outDescendantsMap.find(parent);
+        if (it == outDescendantsMap.end())
+        {
+            outDescendantsMap.emplace(parent, std::vector<const Event*>({eventPtrRaw}));
+        }
+        else
+        {
+            it->second.push_back(eventPtrRaw);
+        }
+    }
+}
+
+void Profiler::Print(std::ostream& outStream) const
+{
+    // Makes sure timestamps are output with 6 decimals, and save old settings.
+    std::streamsize oldPrecision = outStream.precision();
+    outStream.precision(6);
+    std::ios_base::fmtflags oldFlags = outStream.flags();
+    outStream.setf(std::ios::fixed);
+    JsonPrinter printer(outStream);
+
+    // First find all the "inference" Events and print out duration measurements.
+    int baseLevel = -1;
+    std::vector<const Event*> inferences;
+    PopulateInferences(inferences, baseLevel);
+
+    // Second map out descendants hierarchy
+    std::map<const Event*, std::vector<const Event*>> descendantsMap;
+    PopulateDescendants(descendantsMap);
+
+    JsonChildObject inferenceObject{"inference_measurements"};
+    JsonChildObject layerObject{"layer_measurements"};
+    std::vector<JsonChildObject> workloadObjects;
+    std::map<unsigned int, std::vector<JsonChildObject>> workloadToKernelObjects;
+
+    for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex)
+    {
+        auto inference = inferences[inferenceIndex];
+        Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, inference);
+        inferenceObject.SetUnit(measurement.m_Unit);
+        inferenceObject.AddMeasurement(measurement.m_Value);
+
+        auto layerEventsIt = descendantsMap.find(inference);
+
+        // Assuming 1 Execute per inference
+        if (layerEventsIt != descendantsMap.end())
+        {
+            auto layerEvent = layerEventsIt->second[0];
+            Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, layerEvent);
+            layerObject.SetUnit(measurement.m_Unit);
+            layerObject.AddMeasurement(measurement.m_Value);
+
+            // Get Descendant Events for Execute
+            auto workloadEventsIt = descendantsMap.find(layerEvent);
+            for(unsigned int workloadIndex = 0; workloadIndex < workloadEventsIt->second.size(); ++workloadIndex)
+            {
+                auto workloadEvent = workloadEventsIt->second[workloadIndex];
+                Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, workloadEvent);
+                std::vector<Measurement> kernelMeasurements = FindKernelMeasurements(workloadEvent);
+                if (inferenceIndex == 0)
+                {
+                    // Only add second level once, in case of multiple inferences
+                    JsonChildObject workloadObject{workloadEvent->GetName()};
+                    workloadObject.SetUnit(measurement.m_Unit);
+                    workloadObjects.push_back(workloadObject);
+                }
+                workloadObjects[workloadIndex].AddMeasurement(measurement.m_Value);
+
+                for(unsigned int kernelIndex = 0; kernelIndex < kernelMeasurements.size(); ++kernelIndex)
+                {
+                    if (inferenceIndex == 0)
+                    {
+                        // Only add kernel measurement once, in case of multiple inferences
+                        JsonChildObject kernelObject{kernelMeasurements[kernelIndex].m_Name};
+                        kernelObject.SetUnit(kernelMeasurements[kernelIndex].m_Unit);
+                        workloadToKernelObjects[workloadIndex].push_back(kernelObject);
+
+                    }
+                    workloadToKernelObjects[workloadIndex][kernelIndex].
+                            AddMeasurement(kernelMeasurements[kernelIndex].m_Value);
+                }
+            }
+        }
+    }
+
+    for (auto workloadToKernelPair : workloadToKernelObjects)
+    {
+        for (auto kernelObject : workloadToKernelPair.second)
+        {
+            workloadObjects[workloadToKernelPair.first].AddChild(kernelObject);
+        }
+    }
 
-    m_NestingLevel--;
+    for (auto workloadObject : workloadObjects)
+    {
+        layerObject.AddChild(workloadObject);
+    }
+    inferenceObject.AddChild(layerObject);
+
+    printer.PrintHeader();
+    printer.PrintArmNNHeader();
+
+    // print inference object, also prints child layer and kernel measurements
+    printer.PrintJsonChildObject(inferenceObject);
+
+    // end of ArmNN
+    printer.PrintNewLine();
+    printer.PrintFooter();
+
+    // end of main JSON object
+    printer.PrintNewLine();
+    printer.PrintFooter();
+    printer.PrintNewLine();
+
+    // Restores previous precision settings.
+    outStream.flags(oldFlags);
+    outStream.precision(oldPrecision);
 }
 
 void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const
 {
     // Stack should be empty now.
-    const bool saneMarkerSequence = m_ObservedMarkers.empty();
+    const bool saneMarkerSequence = m_Parents.empty();
 
     // Abort if the sequence of markers was found to have incorrect information:
     // The stats cannot be trusted.
@@ -206,39 +411,69 @@ void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const
         return;
     }
 
-    // Analyze the full sequence of events
-    AnalyzeEventSequenceAndWriteResults(m_EventSequence.begin(), m_EventSequence.end(), outStream);
+    // Analyzes the full sequence of events.
+    AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(),
+                                        m_EventSequence.cend(),
+                                        outStream);
 
-    // Aggregate events by tag if requested (spams the output stream if done for all tags)
-    if (m_EventTagUpdated && g_AggregateProfilingEventsByTag)
+    // Aggregates events by tag if requested (spams the output stream if done for all tags).
+    if (g_AggregateProfilingEventsByInference)
     {
         outStream << std::endl;
         outStream << "***" << std::endl;
-        outStream << "*** Per Tag Stats" << std::endl;
+        outStream << "*** Per Inference Stats" << std::endl;
         outStream << "***" << std::endl;
         outStream << std::endl;
 
-        for (auto iter = m_EventSequence.begin(); iter != m_EventSequence.end();)
-        {
-            const uint32_t tag = iter->m_Tag;
+        int baseLevel = -1;
+        std::vector<const Event*> inferences;
+        PopulateInferences(inferences, baseLevel);
 
-            // Advance iter until we find the first non-matching tag
-            auto tagEndIter = iter;
-            for (; tagEndIter != m_EventSequence.end(); ++tagEndIter)
+        // Second map out descendants hierarchy
+        std::map<const Event*, std::vector<const Event*>> descendantsMap;
+        PopulateDescendants(descendantsMap);
+
+        std::function<void (const Event*, std::vector<const Event*>&)>
+            FindDescendantEvents = [&](const Event* eventPtr,
+                std::vector<const Event*>& sequence)
             {
-                if (tagEndIter->m_Tag != tag)
+                sequence.push_back(eventPtr);
+
+                if (CalcLevel(eventPtr) > baseLevel+2) //We only care about levels as deep as workload executions.
                 {
-                    break;
+                    return;
                 }
-            }
 
-            outStream << "> Begin Tag: " << tag << std::endl;
+                auto children = descendantsMap.find(eventPtr);
+                if (children == descendantsMap.end())
+                {
+                    return;
+                }
+
+                for (const Event* child : children->second)
+                {
+                    return FindDescendantEvents(child, sequence);
+                }
+            };
+
+        // Third, find events belonging to each inference
+        int inferenceIdx = 0;
+        for (auto inference : inferences)
+        {
+            std::vector<const Event*> sequence;
+
+            //build sequence, depth first
+            FindDescendantEvents(inference, sequence);
+
+            outStream << "> Begin Inference: " << inferenceIdx << std::endl;
             outStream << std::endl;
-            AnalyzeEventSequenceAndWriteResults(iter, tagEndIter, outStream);
+            AnalyzeEventSequenceAndWriteResults(sequence.cbegin(),
+                                                sequence.cend(),
+                                                outStream);
             outStream << std::endl;
-            outStream << "> End Tag: " << tag << std::endl;
+            outStream << "> End Inference: " << inferenceIdx << std::endl;
 
-            iter = tagEndIter;
+            inferenceIdx++;
         }
     }
 }
@@ -253,21 +488,6 @@ void Profiler::WaitForDevice(Compute compute) const
 #endif
 }
 
-const char* Profiler::GetEventComputeDevice(Compute compute) const
-{
-    switch(compute)
-    {
-        case Compute::CpuRef:
-            return "CpuRef";
-        case Compute::CpuAcc:
-            return "CpuAcc";
-        case Compute::GpuAcc:
-            return "GpuAcc";
-        default:
-            return "Undefined";
-    }
-}
-
 std::uint32_t Profiler::GetEventColor(Compute compute) const
 {
     switch(compute)
@@ -287,7 +507,24 @@ std::uint32_t Profiler::GetEventColor(Compute compute) const
     }
 }
 
-} // namespace armnn
+// The thread_local pointer to the profiler instance.
+thread_local Profiler* tl_Profiler = nullptr;
+
+ProfilerManager& ProfilerManager::GetInstance()
+{
+    // Global reference to the single ProfileManager instance allowed.
+    static ProfilerManager s_ProfilerManager;
+    return s_ProfilerManager;
+}
+
+void ProfilerManager::RegisterProfiler(Profiler* profiler)
+{
+    tl_Profiler = profiler;
+}
 
-#endif // ARMNN_PROFILING_ENABLED
+Profiler* ProfilerManager::GetProfiler()
+{
+    return tl_Profiler;
+}
 
+} // namespace armnn
-- 
cgit v1.2.1