From ffe845d4aad5a389e295bab5f78c8088ef187ea0 Mon Sep 17 00:00:00 2001 From: Bhavik Patel Date: Mon, 16 Nov 2020 12:13:56 +0100 Subject: MLBEDSW-3269 Add pmu counters for inference_process Change-Id: I6594acc228fe8048f56cb96ac7846a87491eed38 --- .../include/inference_process.hpp | 9 +++- .../inference_process/src/inference_process.cc | 48 ++++++++++++++++++---- .../message_process/include/message_process.hpp | 8 +++- .../message_process/src/message_process.cc | 40 ++++++++++++++++-- 4 files changed, 92 insertions(+), 13 deletions(-) diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp index ec682d1..67b30c5 100644 --- a/applications/inference_process/include/inference_process.hpp +++ b/applications/inference_process/include/inference_process.hpp @@ -18,6 +18,7 @@ #pragma once +#include #include #include #include @@ -41,6 +42,10 @@ struct InferenceJob { std::vector output; std::vector expectedOutput; size_t numBytesToPrint; + std::vector pmuEventConfig; + uint32_t pmuCycleCounterEnable; + std::vector pmuEventCount; + uint64_t pmuCycleCounterCount; InferenceJob(); InferenceJob(const std::string &name, @@ -48,7 +53,9 @@ struct InferenceJob { const std::vector &input, const std::vector &output, const std::vector &expectedOutput, - size_t numBytesToPrint); + size_t numBytesToPrint, + const std::vector &pmuEventConfig, + const uint32_t pmuCycleCounterEnable); void invalidate(); void clean(); diff --git a/applications/inference_process/src/inference_process.cc b/applications/inference_process/src/inference_process.cc index ecf6144..f3d2da8 100644 --- a/applications/inference_process/src/inference_process.cc +++ b/applications/inference_process/src/inference_process.cc @@ -20,6 +20,7 @@ #include "tensorflow/lite/micro/cortex_m_generic/debug_log_callback.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_interpreter.h" +#include 
"tensorflow/lite/micro/micro_profiler.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" @@ -106,10 +107,17 @@ InferenceJob::InferenceJob(const string &_name, const vector &_input, const vector &_output, const vector &_expectedOutput, - size_t _numBytesToPrint) : + size_t _numBytesToPrint, + const vector &_pmuEventConfig, + const uint32_t pmuCycleCounterEnable) : name(_name), networkModel(_networkModel), input(_input), output(_output), expectedOutput(_expectedOutput), - numBytesToPrint(_numBytesToPrint) {} + numBytesToPrint(_numBytesToPrint), pmuEventConfig(_pmuEventConfig), pmuCycleCounterEnable(pmuCycleCounterEnable), + pmuEventCount(), pmuCycleCounterCount(0) { +#if defined(INFERENCE_PROC_TFLU_PROFILER) && defined(ETHOSU) + pmuEventCount = vector(ETHOSU_PMU_NCOUNTERS, 0); +#endif +} void InferenceJob::invalidate() { networkModel.invalidate(); @@ -183,6 +191,9 @@ bool InferenceProcess::push(const InferenceJob &job) { bool InferenceProcess::runJob(InferenceJob &job) { printf("Running inference job: %s\n", job.name.c_str()); + // Register debug log callback for profiling + RegisterDebugLogCallback(tflu_debug_log); + tflite::MicroErrorReporter microErrorReporter; tflite::ErrorReporter *reporter = &microErrorReporter; @@ -197,7 +208,17 @@ bool InferenceProcess::runJob(InferenceJob &job) { // Create the TFL micro interpreter tflite::AllOpsResolver resolver; - tflite::MicroInterpreter interpreter(model, resolver, inferenceProcessTensorArena, TENSOR_ARENA_SIZE, reporter); + tflite::MicroProfiler profiler(reporter); + +#if defined(INFERENCE_PROC_TFLU_PROFILER) && defined(ETHOSU) + profiler.MonitorEthosuPMUEvents(ethosu_pmu_event_type(job.pmuEventConfig[0]), + ethosu_pmu_event_type(job.pmuEventConfig[1]), + ethosu_pmu_event_type(job.pmuEventConfig[2]), + ethosu_pmu_event_type(job.pmuEventConfig[3])); +#endif + + tflite::MicroInterpreter interpreter( + model, resolver, inferenceProcessTensorArena, TENSOR_ARENA_SIZE, reporter, 
&profiler); // Allocate tensors TfLiteStatus allocate_status = interpreter.AllocateTensors(); @@ -240,9 +261,6 @@ bool InferenceProcess::runJob(InferenceJob &job) { copy(static_cast(input.data), static_cast(input.data) + input.size, tensor->data.uint8); } - // Register debug log callback for profiling - RegisterDebugLogCallback(tflu_debug_log); - // Run the inference TfLiteStatus invoke_status = interpreter.Invoke(); if (invoke_status != kTfLiteOk) { @@ -250,6 +268,22 @@ bool InferenceProcess::runJob(InferenceJob &job) { return true; } + printf("%s : %zu\r\n", "arena_used_bytes", interpreter.arena_used_bytes()); + +#ifdef INFERENCE_PROC_TFLU_PROFILER + printf("Inference runtime: %u cycles\r\n", (unsigned int)profiler.TotalInferenceTime()); + + if (job.pmuCycleCounterEnable != 0) { + job.pmuCycleCounterCount = profiler.TotalInferenceTime(); + } + +#ifdef ETHOSU + for (uint32_t i = 0; i < ETHOSU_PMU_NCOUNTERS; i++) { + job.pmuEventCount[i] = profiler.GetEthosuPMUCounter(i); + } +#endif +#endif + // Copy output data if (job.output.size() > 0) { if (interpreter.outputs_size() != job.output.size()) { @@ -285,7 +319,7 @@ bool InferenceProcess::runJob(InferenceJob &job) { if (job.expectedOutput.size() > 0) { if (job.expectedOutput.size() != interpreter.outputs_size()) { - printf("Expeded number of output tensors does not match network. job=%s, expected=%zu, network=%zu\n", + printf("Expected number of output tensors does not match network. 
job=%s, expected=%zu, network=%zu\n", job.name.c_str(), job.expectedOutput.size(), interpreter.outputs_size()); diff --git a/applications/message_process/include/message_process.hpp b/applications/message_process/include/message_process.hpp index 602c2a4..96da30e 100644 --- a/applications/message_process/include/message_process.hpp +++ b/applications/message_process/include/message_process.hpp @@ -96,7 +96,13 @@ public: void run(); bool handleMessage(); void sendPong(); - void sendInferenceRsp(uint64_t userArg, std::vector &ofm, bool failed); + void sendInferenceRsp(uint64_t userArg, + std::vector &ofm, + bool failed, + std::vector &pmuEventConfig, + uint32_t pmuCycleCounterEnable, + std::vector &pmuEventCount, + uint64_t pmuCycleCounterCount); private: QueueImpl queueIn; diff --git a/applications/message_process/src/message_process.cc b/applications/message_process/src/message_process.cc index c890399..8a8eec7 100644 --- a/applications/message_process/src/message_process.cc +++ b/applications/message_process/src/message_process.cc @@ -240,13 +240,25 @@ bool MessageProcess::handleMessage() { vector expectedOutput; - InferenceJob job("job", networkModel, ifm, ofm, expectedOutput, -1); + vector pmuEventConfig; + for (uint32_t i = 0; i < ETHOSU_CORE_PMU_MAX; i++) { + pmuEventConfig[i] = req.pmu_event_config[i]; + } + + InferenceJob job( + "job", networkModel, ifm, ofm, expectedOutput, -1, pmuEventConfig, req.pmu_cycle_counter_enable); job.invalidate(); bool failed = inferenceProcess.runJob(job); job.clean(); - sendInferenceRsp(req.user_arg, job.output, failed); + sendInferenceRsp(req.user_arg, + job.output, + failed, + job.pmuEventConfig, + job.pmuCycleCounterEnable, + job.pmuEventCount, + job.pmuCycleCounterCount); break; } default: { @@ -266,8 +278,19 @@ void MessageProcess::sendPong() { mailbox.sendMessage(); } -void MessageProcess::sendInferenceRsp(uint64_t userArg, vector &ofm, bool failed) { - ethosu_core_inference_rsp rsp; +void 
MessageProcess::sendInferenceRsp(uint64_t userArg, + vector &ofm, + bool failed, + vector &pmuEventConfig, + uint32_t pmuCycleCounterEnable, + vector &pmuEventCount, + uint64_t pmuCycleCounterCount) { + ethosu_core_inference_rsp rsp = { + .pmu_event_count = + { + 0, + }, + }; rsp.user_arg = userArg; rsp.ofm_count = ofm.size(); @@ -277,6 +300,15 @@ void MessageProcess::sendInferenceRsp(uint64_t userArg, vector &ofm, bo rsp.ofm_size[i] = ofm[i].size; } + for (size_t i = 0; i < pmuEventConfig.size(); i++) { + rsp.pmu_event_config[i] = pmuEventConfig[i]; + } + rsp.pmu_cycle_counter_enable = pmuCycleCounterEnable; + for (size_t i = 0; i < pmuEventCount.size(); i++) { + rsp.pmu_event_count[i] = pmuEventCount[i]; + } + rsp.pmu_cycle_counter_count = pmuCycleCounterCount; + printf("Sending inference response. userArg=0x%" PRIx64 ", ofm_count=%" PRIu32 ", status=%" PRIu32 "\n", rsp.user_arg, rsp.ofm_count, -- cgit v1.2.1