aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jonny Svärd <jonny.svaerd@arm.com> 2021-12-17 17:04:08 +0100
committer Jonny Svärd <jonny.svaerd@arm.com> 2021-12-22 13:09:40 +0100
commit 4c11a488d7f20c219ea6265480bc02f4b2cea1e3 (patch)
tree 5c08668e3294112ed5fc494229c333fdc1a49fa9
parent 05244b78be9eba0c83f6267fdbb3aa34e60eec98 (diff)
download ethos-u-core-software-4c11a488d7f20c219ea6265480bc02f4b2cea1e3.tar.gz
Use lbl-profiler to set up PMU and PMU events
Adapt the layer-by-layer profiler to handle PMU event configuration, PMU setup, and retrieval of PMU counter values. Adapt the inference process application to support the lbl-profiler PMU setup and to retrieve and save PMU counter values in the InferenceJob struct.

Change-Id: I1667a5b11c43c54e7d28232b594dd118bf3f79a8
-rw-r--r-- applications/inference_process/include/inference_process.hpp 4
-rw-r--r-- applications/inference_process/src/inference_process.cpp 29
-rw-r--r-- lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp 23
-rw-r--r-- lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp 113
4 files changed, 120 insertions, 49 deletions
diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp
index db57811..1679e19 100644
--- a/applications/inference_process/include/inference_process.hpp
+++ b/applications/inference_process/include/inference_process.hpp
@@ -43,7 +43,7 @@ struct InferenceJob {
std::vector<DataPtr> expectedOutput;
size_t numBytesToPrint;
std::vector<uint8_t> pmuEventConfig;
- uint32_t pmuCycleCounterEnable;
+ bool pmuCycleCounterEnable;
std::vector<uint32_t> pmuEventCount;
uint64_t pmuCycleCounterCount;
@@ -55,7 +55,7 @@ struct InferenceJob {
const std::vector<DataPtr> &expectedOutput,
size_t numBytesToPrint,
const std::vector<uint8_t> &pmuEventConfig,
- const uint32_t pmuCycleCounterEnable);
+ const bool pmuCycleCounterEnable);
void invalidate();
void clean();
diff --git a/applications/inference_process/src/inference_process.cpp b/applications/inference_process/src/inference_process.cpp
index 4990e62..94c62d3 100644
--- a/applications/inference_process/src/inference_process.cpp
+++ b/applications/inference_process/src/inference_process.cpp
@@ -80,10 +80,10 @@ private:
};
void print_output_data(TfLiteTensor *output, size_t bytesToPrint) {
- constexpr auto crc = Crc();
+ constexpr auto crc = Crc();
const uint32_t output_crc32 = crc.crc32(output->data.data, output->bytes);
- const int numBytesToPrint = min(output->bytes, bytesToPrint);
- int dims_size = output->dims->size;
+ const int numBytesToPrint = min(output->bytes, bytesToPrint);
+ int dims_size = output->dims->size;
LOG("{\n");
LOG("\"dims\": [%d,", dims_size);
for (int i = 0; i < output->dims->size - 1; ++i) {
@@ -91,15 +91,14 @@ void print_output_data(TfLiteTensor *output, size_t bytesToPrint) {
}
LOG("%d],\n", output->dims->data[dims_size - 1]);
LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data);
- if (numBytesToPrint)
- {
+ if (numBytesToPrint) {
LOG("\"crc32\": \"%08" PRIx32 "\",\n", output_crc32);
LOG("\"data\":\"");
for (int i = 0; i < numBytesToPrint - 1; ++i) {
/*
- * Workaround an issue when compiling with GCC where by
- * printing only a '\n' the produced global output is wrong.
- */
+ * Workaround an issue when compiling with GCC where by
+ * printing only a '\n' the produced global output is wrong.
+ */
if (i % 15 == 0 && i != 0) {
LOG("0x%02x,\n", output->data.uint8[i]);
} else {
@@ -107,8 +106,7 @@ void print_output_data(TfLiteTensor *output, size_t bytesToPrint) {
}
}
LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]);
- }
- else {
+ } else {
LOG("\"crc32\": \"%08" PRIx32 "\"\n", output_crc32);
}
LOG("}");
@@ -156,7 +154,7 @@ InferenceJob::InferenceJob(const string &_name,
const vector<DataPtr> &_expectedOutput,
size_t _numBytesToPrint,
const vector<uint8_t> &_pmuEventConfig,
- const uint32_t _pmuCycleCounterEnable) :
+ const bool _pmuCycleCounterEnable) :
name(_name),
networkModel(_networkModel), input(_input), output(_output), expectedOutput(_expectedOutput),
numBytesToPrint(_numBytesToPrint), pmuEventConfig(_pmuEventConfig), pmuCycleCounterEnable(_pmuCycleCounterEnable),
@@ -250,7 +248,7 @@ bool InferenceProcess::runJob(InferenceJob &job) {
// Create the TFL micro interpreter
tflite::AllOpsResolver resolver;
#ifdef LAYER_BY_LAYER_PROFILER
- tflite::LayerByLayerProfiler profiler;
+ tflite::LayerByLayerProfiler profiler(job.pmuEventConfig, job.pmuCycleCounterEnable);
#else
tflite::ArmProfiler profiler;
#endif
@@ -308,9 +306,12 @@ bool InferenceProcess::runJob(InferenceJob &job) {
LOG("Inference runtime: %u cycles\n", (unsigned int)profiler.GetTotalTicks());
- if (job.pmuCycleCounterEnable != 0) {
- job.pmuCycleCounterCount = profiler.GetTotalTicks();
+#ifdef LAYER_BY_LAYER_PROFILER
+ if (job.pmuCycleCounterEnable) {
+ job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount();
}
+ job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end());
+#endif
// Copy output data
if (job.output.size() > 0) {
diff --git a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
index 8e8dc0f..37bd868 100644
--- a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
+++ b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
@@ -23,6 +23,7 @@
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include <memory>
#include <pmu_ethosu.h>
+#include <vector>
// NOTE: This profiler only works on systems with 1 NPU due to the use of
// ethosu_reserve_driver().
@@ -30,22 +31,32 @@ namespace tflite {
class LayerByLayerProfiler : public MicroProfiler {
public:
enum Backend { PRINTF, EVENT_RECORDER };
- LayerByLayerProfiler(size_t max_events = 200,
- Backend backend = PRINTF,
- int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone));
+ LayerByLayerProfiler(const std::vector<uint8_t> &event_config = {},
+ bool pmu_cycle_counter_enable = true,
+ size_t max_events = 200,
+ Backend backend = PRINTF,
+ int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone));
uint32_t BeginEvent(const char *tag);
void EndEvent(uint32_t event_handle);
uint64_t GetTotalTicks() const;
void Log() const;
+ uint64_t GetPmuCycleCounterCount() const;
+ const std::vector<uint32_t> &GetPmuEventCount() const;
+
private:
- size_t max_events_;
std::unique_ptr<const char *[]> tags_;
std::unique_ptr<uint64_t[]> start_ticks_;
std::unique_ptr<uint64_t[]> end_ticks_;
- Backend backend_;
- int32_t event_id_;
+ std::vector<uint8_t> pmu_event_config;
+ std::vector<uint32_t> pmu_event_count;
+ bool pmu_cycle_counter_enable;
+ uint64_t pmu_cycle_counter_count;
+
+ size_t max_events_;
+ Backend backend;
+ int32_t event_id;
size_t num_events_;
TF_LITE_REMOVE_VIRTUAL_DELETE;
diff --git a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
index 0211414..a5b8e5a 100644
--- a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
+++ b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
@@ -39,17 +39,18 @@ uint64_t GetCurrentEthosuTicks(struct ethosu_driver *drv) {
namespace tflite {
-LayerByLayerProfiler::LayerByLayerProfiler(size_t max_events, Backend backend, int32_t event_id) :
- max_events_(max_events), backend_(backend), event_id_(event_id), num_events_(0) {
-
- tags_ = std::make_unique<const char *[]>(max_events_);
- start_ticks_ = std::make_unique<uint64_t[]>(max_events_);
- end_ticks_ = std::make_unique<uint64_t[]>(max_events_);
-
- struct ethosu_driver *drv = ethosu_reserve_driver();
- ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
- ETHOSU_PMU_CYCCNT_Reset(drv);
- ethosu_release_driver(drv);
+LayerByLayerProfiler::LayerByLayerProfiler(const std::vector<uint8_t> &event_config,
+ bool pmu_cycle_counter_enable,
+ size_t max_events,
+ Backend backend,
+ int32_t event_id) :
+ pmu_event_config(event_config),
+ pmu_event_count(), pmu_cycle_counter_enable(pmu_cycle_counter_enable), pmu_cycle_counter_count(0),
+ max_events_(max_events), backend(backend), event_id(event_id), num_events_(0) {
+
+ tags_ = std::make_unique<const char *[]>(max_events);
+ start_ticks_ = std::make_unique<uint64_t[]>(max_events);
+ end_ticks_ = std::make_unique<uint64_t[]>(max_events);
}
// NOTE: THIS PROFILER ONLY WORKS ON SYSTEMS WITH 1 NPU
@@ -62,17 +63,44 @@ uint32_t LayerByLayerProfiler::BeginEvent(const char *tag) {
tags_[num_events_] = tag;
if (strcmp("ethos-u", tag) == 0) {
- struct ethosu_driver *ethosu_drv = ethosu_reserve_driver();
- ETHOSU_PMU_CYCCNT_Reset(ethosu_drv);
- ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(ethosu_drv, ETHOSU_PMU_NPU_ACTIVE);
- ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(ethosu_drv, ETHOSU_PMU_NPU_IDLE);
- start_ticks_[num_events_] = GetCurrentEthosuTicks(ethosu_drv);
- ethosu_release_driver(ethosu_drv);
+ struct ethosu_driver *drv = ethosu_reserve_driver();
+ size_t numEventCounters = ETHOSU_PMU_Get_NumEventCounters();
+
+ if (pmu_event_config.size() > numEventCounters) {
+ LOG_WARN("PMU event config list is bigger (%lu) than available PMU event counters (%lu)",
+ pmu_event_config.size(),
+ numEventCounters);
+ LOG_WARN("PMU event config list will be truncated");
+ pmu_event_config.resize(numEventCounters);
+ }
+ // Enable PMU
+ ETHOSU_PMU_Enable(drv);
+
+ for (size_t i = 0; i < pmu_event_config.size(); i++) {
+ ETHOSU_PMU_Set_EVTYPER(drv, i, static_cast<ethosu_pmu_event_type>(pmu_event_config[i]));
+ }
+
+ ETHOSU_PMU_CNTR_Enable(drv, (1 << pmu_event_config.size()) - 1);
+ ETHOSU_PMU_EVCNTR_ALL_Reset(drv);
+
+ // Configure the cycle counter
+ if (pmu_cycle_counter_enable) {
+ ETHOSU_PMU_CNTR_Disable(drv, ETHOSU_PMU_CCNT_Msk);
+ ETHOSU_PMU_CYCCNT_Reset(drv);
+
+ ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE);
+ ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE);
+
+ ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
+ }
+ start_ticks_[num_events_] = 0; // Hardware cycle counter has been reset above, thus starts at 0
+ ethosu_release_driver(drv);
} else {
start_ticks_[num_events_] = GetCurrentTimeTicks();
}
- end_ticks_[num_events_] = start_ticks_[num_events_] - 1;
+ end_ticks_[num_events_] =
+ start_ticks_[num_events_]; // NOTE: In case an EndEvent() doesn't trigger, cycles reports as 0
return num_events_++;
}
@@ -81,19 +109,42 @@ void LayerByLayerProfiler::EndEvent(uint32_t event_handle) {
TFLITE_DCHECK(event_handle < max_events_);
if (strcmp("ethos-u", tags_[event_handle]) == 0) {
- struct ethosu_driver *ethosu_drv = ethosu_reserve_driver();
- end_ticks_[event_handle] = GetCurrentEthosuTicks(ethosu_drv);
- ethosu_release_driver(ethosu_drv);
+ struct ethosu_driver *drv = ethosu_reserve_driver();
+
+ end_ticks_[event_handle] = GetCurrentEthosuTicks(drv);
+ // Get the cycle count
+ if (pmu_cycle_counter_enable) {
+ pmu_cycle_counter_count = end_ticks_[event_handle];
+ }
+
+ // Save the PMU counter values
+ // NOTE: If multiple ethos-u layers, only the latest will be saved
+ pmu_event_count.resize(pmu_event_config.size());
+ for (size_t i = 0; i < pmu_event_config.size(); i++) {
+ pmu_event_count[i] = ETHOSU_PMU_Get_EVCNTR(drv, i);
+ }
+
+ // Shut down the PMU
+ ETHOSU_PMU_Disable(drv);
+
+ ethosu_release_driver(drv);
} else {
end_ticks_[event_handle] = GetCurrentTimeTicks();
}
- if (backend_ == PRINTF) {
- LOG("%s : cycle_cnt : %" PRIu64 " cycles\n",
- tags_[event_handle],
- end_ticks_[event_handle] - start_ticks_[event_handle]);
+ if (backend == PRINTF) {
+ if (strcmp("ethos-u", tags_[event_handle]) == 0) {
+ for (size_t i = 0; i < pmu_event_count.size(); i++) {
+ LOG("ethos-u : ethosu_pmu_cntr%lu : %u\n", i, pmu_event_count[i]);
+ }
+ LOG("ethos-u : cycle_cnt : %" PRIu64 " cycles\n", pmu_cycle_counter_count);
+ } else {
+ LOG("%s : cycle_cnt : %" PRIu64 " cycles\n",
+ tags_[event_handle],
+ end_ticks_[event_handle] - start_ticks_[event_handle]);
+ }
} else {
- EventRecord2(event_id_, (int32_t)event_handle, end_ticks_[event_handle] - start_ticks_[event_handle]);
+ EventRecord2(event_id, (int32_t)event_handle, end_ticks_[event_handle] - start_ticks_[event_handle]);
}
}
@@ -107,10 +158,18 @@ uint64_t LayerByLayerProfiler::GetTotalTicks() const {
return ticks;
}
+uint64_t LayerByLayerProfiler::GetPmuCycleCounterCount() const {
+ return pmu_cycle_counter_count;
+}
+
+const std::vector<uint32_t> &LayerByLayerProfiler::GetPmuEventCount() const {
+ return pmu_event_count;
+}
+
void LayerByLayerProfiler::Log() const {
#if !defined(TF_LITE_STRIP_ERROR_STRINGS)
- if (backend_ == PRINTF) {
+ if (backend == PRINTF) {
for (size_t i = 0; i < num_events_; ++i) {
uint64_t ticks = end_ticks_[i] - start_ticks_[i];
LOG("%s took %" PRIu64 " cycles", tags_[i], ticks);