From 4c11a488d7f20c219ea6265480bc02f4b2cea1e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonny=20Sv=C3=A4rd?= Date: Fri, 17 Dec 2021 17:04:08 +0100 Subject: Use lbl-profiler to setup PMU and PMU events Adapt layer-by-layer profiler to handle PMU event configuration, PMU setup and retrieving PMU counter values. Adapt the inference process application to support the lbl-profiler PMU setup and retrieve/save PMU counter values in the InferenceJob struct. Change-Id: I1667a5b11c43c54e7d28232b594dd118bf3f79a8 --- .../include/inference_process.hpp | 4 +- .../inference_process/src/inference_process.cpp | 29 +++--- .../include/layer_by_layer_profiler.hpp | 23 +++-- .../src/layer_by_layer_profiler.cpp | 113 ++++++++++++++++----- 4 files changed, 120 insertions(+), 49 deletions(-) diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp index db57811..1679e19 100644 --- a/applications/inference_process/include/inference_process.hpp +++ b/applications/inference_process/include/inference_process.hpp @@ -43,7 +43,7 @@ struct InferenceJob { std::vector expectedOutput; size_t numBytesToPrint; std::vector pmuEventConfig; - uint32_t pmuCycleCounterEnable; + bool pmuCycleCounterEnable; std::vector pmuEventCount; uint64_t pmuCycleCounterCount; @@ -55,7 +55,7 @@ struct InferenceJob { const std::vector &expectedOutput, size_t numBytesToPrint, const std::vector &pmuEventConfig, - const uint32_t pmuCycleCounterEnable); + const bool pmuCycleCounterEnable); void invalidate(); void clean(); diff --git a/applications/inference_process/src/inference_process.cpp b/applications/inference_process/src/inference_process.cpp index 4990e62..94c62d3 100644 --- a/applications/inference_process/src/inference_process.cpp +++ b/applications/inference_process/src/inference_process.cpp @@ -80,10 +80,10 @@ private: }; void print_output_data(TfLiteTensor *output, size_t bytesToPrint) { - constexpr auto crc = Crc(); + constexpr auto crc = Crc(); const uint32_t output_crc32 = crc.crc32(output->data.data, output->bytes); - const int numBytesToPrint = min(output->bytes, bytesToPrint); - int dims_size = output->dims->size; + const int numBytesToPrint = min(output->bytes, bytesToPrint); + int dims_size = output->dims->size; LOG("{\n"); LOG("\"dims\": [%d,", dims_size); for (int i = 0; i < output->dims->size - 1; ++i) { @@ -91,15 +91,14 @@ void print_output_data(TfLiteTensor *output, size_t bytesToPrint) { } LOG("%d],\n", output->dims->data[dims_size - 1]); LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data); - if (numBytesToPrint) - { + if (numBytesToPrint) { LOG("\"crc32\": \"%08" PRIx32 "\",\n", output_crc32); LOG("\"data\":\""); for (int i = 0; i < numBytesToPrint - 1; ++i) { /* - * Workaround an issue when compiling with GCC where by - * printing only a '\n' the produced global output is wrong. - */ + * Workaround an issue when compiling with GCC where by + * printing only a '\n' the produced global output is wrong. + */ if (i % 15 == 0 && i != 0) { LOG("0x%02x,\n", output->data.uint8[i]); } else { @@ -107,8 +106,7 @@ void print_output_data(TfLiteTensor *output, size_t bytesToPrint) { } } LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]); - } - else { + } else { LOG("\"crc32\": \"%08" PRIx32 "\"\n", output_crc32); } LOG("}"); @@ -156,7 +154,7 @@ InferenceJob::InferenceJob(const string &_name, const vector &_expectedOutput, size_t _numBytesToPrint, const vector &_pmuEventConfig, - const uint32_t _pmuCycleCounterEnable) : + const bool _pmuCycleCounterEnable) : name(_name), networkModel(_networkModel), input(_input), output(_output), expectedOutput(_expectedOutput), numBytesToPrint(_numBytesToPrint), pmuEventConfig(_pmuEventConfig), pmuCycleCounterEnable(_pmuCycleCounterEnable), @@ -250,7 +248,7 @@ bool InferenceProcess::runJob(InferenceJob &job) { // Create the TFL micro interpreter tflite::AllOpsResolver resolver; #ifdef LAYER_BY_LAYER_PROFILER - tflite::LayerByLayerProfiler profiler; + tflite::LayerByLayerProfiler profiler(job.pmuEventConfig, job.pmuCycleCounterEnable); #else tflite::ArmProfiler profiler; #endif @@ -308,9 +306,12 @@ bool InferenceProcess::runJob(InferenceJob &job) { LOG("Inference runtime: %u cycles\n", (unsigned int)profiler.GetTotalTicks()); - if (job.pmuCycleCounterEnable != 0) { - job.pmuCycleCounterCount = profiler.GetTotalTicks(); +#ifdef LAYER_BY_LAYER_PROFILER + if (job.pmuCycleCounterEnable) { + job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount(); } + job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end()); +#endif // Copy output data if (job.output.size() > 0) { diff --git a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp index 8e8dc0f..37bd868 100644 --- a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp +++ b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp @@ -23,6 +23,7 @@ #include "tensorflow/lite/kernels/internal/compatibility.h" #include #include +#include // NOTE: This profiler only works on systems with 1 NPU due to the use of // ethosu_reserve_driver(). @@ -30,22 +31,32 @@ namespace tflite { class LayerByLayerProfiler : public MicroProfiler { public: enum Backend { PRINTF, EVENT_RECORDER }; - LayerByLayerProfiler(size_t max_events = 200, - Backend backend = PRINTF, - int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone)); + LayerByLayerProfiler(const std::vector &event_config = {}, + bool pmu_cycle_counter_enable = true, + size_t max_events = 200, + Backend backend = PRINTF, + int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone)); uint32_t BeginEvent(const char *tag); void EndEvent(uint32_t event_handle); uint64_t GetTotalTicks() const; void Log() const; + uint64_t GetPmuCycleCounterCount() const; + const std::vector &GetPmuEventCount() const; + private: - size_t max_events_; std::unique_ptr tags_; std::unique_ptr start_ticks_; std::unique_ptr end_ticks_; - Backend backend_; - int32_t event_id_; + std::vector pmu_event_config; + std::vector pmu_event_count; + bool pmu_cycle_counter_enable; + uint64_t pmu_cycle_counter_count; + + size_t max_events_; + Backend backend; + int32_t event_id; size_t num_events_; TF_LITE_REMOVE_VIRTUAL_DELETE; diff --git a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp index 0211414..a5b8e5a 100644 --- a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp +++ b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp @@ -39,17 +39,18 @@ uint64_t GetCurrentEthosuTicks(struct ethosu_driver *drv) { namespace tflite { -LayerByLayerProfiler::LayerByLayerProfiler(size_t max_events, Backend backend, int32_t event_id) : - max_events_(max_events), backend_(backend), event_id_(event_id), num_events_(0) { - - tags_ = std::make_unique(max_events_); - start_ticks_ = std::make_unique(max_events_); - end_ticks_ = std::make_unique(max_events_); - - struct ethosu_driver *drv = ethosu_reserve_driver(); - ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); - ETHOSU_PMU_CYCCNT_Reset(drv); - ethosu_release_driver(drv); +LayerByLayerProfiler::LayerByLayerProfiler(const std::vector &event_config, + bool pmu_cycle_counter_enable, + size_t max_events, + Backend backend, + int32_t event_id) : + pmu_event_config(event_config), + pmu_event_count(), pmu_cycle_counter_enable(pmu_cycle_counter_enable), pmu_cycle_counter_count(0), + max_events_(max_events), backend(backend), event_id(event_id), num_events_(0) { + + tags_ = std::make_unique(max_events); + start_ticks_ = std::make_unique(max_events); + end_ticks_ = std::make_unique(max_events); } // NOTE: THIS PROFILER ONLY WORKS ON SYSTEMS WITH 1 NPU @@ -62,17 +63,44 @@ uint32_t LayerByLayerProfiler::BeginEvent(const char *tag) { tags_[num_events_] = tag; if (strcmp("ethos-u", tag) == 0) { - struct ethosu_driver *ethosu_drv = ethosu_reserve_driver(); - ETHOSU_PMU_CYCCNT_Reset(ethosu_drv); - ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(ethosu_drv, ETHOSU_PMU_NPU_ACTIVE); - ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(ethosu_drv, ETHOSU_PMU_NPU_IDLE); - start_ticks_[num_events_] = GetCurrentEthosuTicks(ethosu_drv); - ethosu_release_driver(ethosu_drv); + struct ethosu_driver *drv = ethosu_reserve_driver(); + size_t numEventCounters = ETHOSU_PMU_Get_NumEventCounters(); + + if (pmu_event_config.size() > numEventCounters) { + LOG_WARN("PMU event config list is bigger (%lu) than available PMU event counters (%lu)", + pmu_event_config.size(), + numEventCounters); + LOG_WARN("PMU event config list will be truncated"); + pmu_event_config.resize(numEventCounters); + } + // Enable PMU + ETHOSU_PMU_Enable(drv); + + for (size_t i = 0; i < pmu_event_config.size(); i++) { + ETHOSU_PMU_Set_EVTYPER(drv, i, static_cast(pmu_event_config[i])); + } + + ETHOSU_PMU_CNTR_Enable(drv, (1 << pmu_event_config.size()) - 1); + ETHOSU_PMU_EVCNTR_ALL_Reset(drv); + + // Configure the cycle counter + if (pmu_cycle_counter_enable) { + ETHOSU_PMU_CNTR_Disable(drv, ETHOSU_PMU_CCNT_Msk); + ETHOSU_PMU_CYCCNT_Reset(drv); + + ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE); + ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE); + + ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); + } + start_ticks_[num_events_] = 0; // Hardware cycle counter has been reset above, thus starts at 0 + ethosu_release_driver(drv); } else { start_ticks_[num_events_] = GetCurrentTimeTicks(); } - end_ticks_[num_events_] = start_ticks_[num_events_] - 1; + end_ticks_[num_events_] = + start_ticks_[num_events_]; // NOTE: In case an EndEvent() doesn't trigger, cycles reports as 0 return num_events_++; } @@ -81,19 +109,42 @@ void LayerByLayerProfiler::EndEvent(uint32_t event_handle) { TFLITE_DCHECK(event_handle < max_events_); if (strcmp("ethos-u", tags_[event_handle]) == 0) { - struct ethosu_driver *ethosu_drv = ethosu_reserve_driver(); - end_ticks_[event_handle] = GetCurrentEthosuTicks(ethosu_drv); - ethosu_release_driver(ethosu_drv); + struct ethosu_driver *drv = ethosu_reserve_driver(); + + end_ticks_[event_handle] = GetCurrentEthosuTicks(drv); + // Get the cycle count + if (pmu_cycle_counter_enable) { + pmu_cycle_counter_count = end_ticks_[event_handle]; + } + + // Save the PMU counter values + // NOTE: If multiple ethos-u layers, only the latest will be saved + pmu_event_count.resize(pmu_event_config.size()); + for (size_t i = 0; i < pmu_event_config.size(); i++) { + pmu_event_count[i] = ETHOSU_PMU_Get_EVCNTR(drv, i); + } + + // Shut down the PMU + ETHOSU_PMU_Disable(drv); + + ethosu_release_driver(drv); } else { end_ticks_[event_handle] = GetCurrentTimeTicks(); } - if (backend_ == PRINTF) { - LOG("%s : cycle_cnt : %" PRIu64 " cycles\n", - tags_[event_handle], - end_ticks_[event_handle] - start_ticks_[event_handle]); + if (backend == PRINTF) { + if (strcmp("ethos-u", tags_[event_handle]) == 0) { + for (size_t i = 0; i < pmu_event_count.size(); i++) { + LOG("ethos-u : ethosu_pmu_cntr%lu : %u\n", i, pmu_event_count[i]); + } + LOG("ethos-u : cycle_cnt : %" PRIu64 " cycles\n", pmu_cycle_counter_count); + } else { + LOG("%s : cycle_cnt : %" PRIu64 " cycles\n", + tags_[event_handle], + end_ticks_[event_handle] - start_ticks_[event_handle]); + } } else { - EventRecord2(event_id_, (int32_t)event_handle, end_ticks_[event_handle] - start_ticks_[event_handle]); + EventRecord2(event_id, (int32_t)event_handle, end_ticks_[event_handle] - start_ticks_[event_handle]); } } @@ -107,10 +158,18 @@ uint64_t LayerByLayerProfiler::GetTotalTicks() const { return ticks; } +uint64_t LayerByLayerProfiler::GetPmuCycleCounterCount() const { + return pmu_cycle_counter_count; +} + +const std::vector &LayerByLayerProfiler::GetPmuEventCount() const { + return pmu_event_count; +} + void LayerByLayerProfiler::Log() const { #if !defined(TF_LITE_STRIP_ERROR_STRINGS) - if (backend_ == PRINTF) { + if (backend == PRINTF) { for (size_t i = 0; i < num_events_; ++i) { uint64_t ticks = end_ticks_[i] - start_ticks_[i]; LOG("%s took %" PRIu64 " cycles", tags_[i], ticks); -- cgit v1.2.1