diff options
Diffstat (limited to 'applications')
4 files changed, 92 insertions, 13 deletions
diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp index ec682d1..67b30c5 100644 --- a/applications/inference_process/include/inference_process.hpp +++ b/applications/inference_process/include/inference_process.hpp @@ -18,6 +18,7 @@ #pragma once +#include <array> #include <queue> #include <stdlib.h> #include <string> @@ -41,6 +42,10 @@ struct InferenceJob { std::vector<DataPtr> output; std::vector<DataPtr> expectedOutput; size_t numBytesToPrint; + std::vector<uint8_t> pmuEventConfig; + uint32_t pmuCycleCounterEnable; + std::vector<uint32_t> pmuEventCount; + uint64_t pmuCycleCounterCount; InferenceJob(); InferenceJob(const std::string &name, @@ -48,7 +53,9 @@ struct InferenceJob { const std::vector<DataPtr> &input, const std::vector<DataPtr> &output, const std::vector<DataPtr> &expectedOutput, - size_t numBytesToPrint); + size_t numBytesToPrint, + const std::vector<uint8_t> &pmuEventConfig, + const uint32_t pmuCycleCounterEnable); void invalidate(); void clean(); diff --git a/applications/inference_process/src/inference_process.cc b/applications/inference_process/src/inference_process.cc index ecf6144..f3d2da8 100644 --- a/applications/inference_process/src/inference_process.cc +++ b/applications/inference_process/src/inference_process.cc @@ -20,6 +20,7 @@ #include "tensorflow/lite/micro/cortex_m_generic/debug_log_callback.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/micro/micro_profiler.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/version.h" @@ -106,10 +107,17 @@ InferenceJob::InferenceJob(const string &_name, const vector<DataPtr> &_input, const vector<DataPtr> &_output, const vector<DataPtr> &_expectedOutput, - size_t _numBytesToPrint) : + size_t _numBytesToPrint, + const vector<uint8_t> &_pmuEventConfig, + const uint32_t pmuCycleCounterEnable) : 
name(_name), networkModel(_networkModel), input(_input), output(_output), expectedOutput(_expectedOutput), - numBytesToPrint(_numBytesToPrint) {} + numBytesToPrint(_numBytesToPrint), pmuEventConfig(_pmuEventConfig), pmuCycleCounterEnable(pmuCycleCounterEnable), + pmuEventCount(), pmuCycleCounterCount(0) { +#if defined(INFERENCE_PROC_TFLU_PROFILER) && defined(ETHOSU) + pmuEventCount = vector<uint32_t>(ETHOSU_PMU_NCOUNTERS, 0); +#endif +} void InferenceJob::invalidate() { networkModel.invalidate(); @@ -183,6 +191,9 @@ bool InferenceProcess::push(const InferenceJob &job) { bool InferenceProcess::runJob(InferenceJob &job) { printf("Running inference job: %s\n", job.name.c_str()); + // Register debug log callback for profiling + RegisterDebugLogCallback(tflu_debug_log); + tflite::MicroErrorReporter microErrorReporter; tflite::ErrorReporter *reporter = &microErrorReporter; @@ -197,7 +208,17 @@ bool InferenceProcess::runJob(InferenceJob &job) { // Create the TFL micro interpreter tflite::AllOpsResolver resolver; - tflite::MicroInterpreter interpreter(model, resolver, inferenceProcessTensorArena, TENSOR_ARENA_SIZE, reporter); + tflite::MicroProfiler profiler(reporter); + +#if defined(INFERENCE_PROC_TFLU_PROFILER) && defined(ETHOSU) + profiler.MonitorEthosuPMUEvents(ethosu_pmu_event_type(job.pmuEventConfig[0]), + ethosu_pmu_event_type(job.pmuEventConfig[1]), + ethosu_pmu_event_type(job.pmuEventConfig[2]), + ethosu_pmu_event_type(job.pmuEventConfig[3])); +#endif + + tflite::MicroInterpreter interpreter( + model, resolver, inferenceProcessTensorArena, TENSOR_ARENA_SIZE, reporter, &profiler); // Allocate tensors TfLiteStatus allocate_status = interpreter.AllocateTensors(); @@ -240,9 +261,6 @@ bool InferenceProcess::runJob(InferenceJob &job) { copy(static_cast<char *>(input.data), static_cast<char *>(input.data) + input.size, tensor->data.uint8); } - // Register debug log callback for profiling - RegisterDebugLogCallback(tflu_debug_log); - // Run the inference TfLiteStatus
invoke_status = interpreter.Invoke(); if (invoke_status != kTfLiteOk) { @@ -250,6 +268,22 @@ bool InferenceProcess::runJob(InferenceJob &job) { return true; } + printf("%s : %zu\r\n", "arena_used_bytes", interpreter.arena_used_bytes()); + +#ifdef INFERENCE_PROC_TFLU_PROFILER + printf("Inference runtime: %u cycles\r\n", (unsigned int)profiler.TotalInferenceTime()); + + if (job.pmuCycleCounterEnable != 0) { + job.pmuCycleCounterCount = profiler.TotalInferenceTime(); + } + +#ifdef ETHOSU + for (uint32_t i = 0; i < ETHOSU_PMU_NCOUNTERS; i++) { + job.pmuEventCount[i] = profiler.GetEthosuPMUCounter(i); + } +#endif +#endif + // Copy output data if (job.output.size() > 0) { if (interpreter.outputs_size() != job.output.size()) { @@ -285,7 +319,7 @@ bool InferenceProcess::runJob(InferenceJob &job) { if (job.expectedOutput.size() > 0) { if (job.expectedOutput.size() != interpreter.outputs_size()) { - printf("Expeded number of output tensors does not match network. job=%s, expected=%zu, network=%zu\n", + printf("Expected number of output tensors does not match network. 
job=%s, expected=%zu, network=%zu\n", job.name.c_str(), job.expectedOutput.size(), interpreter.outputs_size()); diff --git a/applications/message_process/include/message_process.hpp b/applications/message_process/include/message_process.hpp index 602c2a4..96da30e 100644 --- a/applications/message_process/include/message_process.hpp +++ b/applications/message_process/include/message_process.hpp @@ -96,7 +96,13 @@ public: void run(); bool handleMessage(); void sendPong(); - void sendInferenceRsp(uint64_t userArg, std::vector<InferenceProcess::DataPtr> &ofm, bool failed); + void sendInferenceRsp(uint64_t userArg, + std::vector<InferenceProcess::DataPtr> &ofm, + bool failed, + std::vector<uint8_t> &pmuEventConfig, + uint32_t pmuCycleCounterEnable, + std::vector<uint32_t> &pmuEventCount, + uint64_t pmuCycleCounterCount); private: QueueImpl queueIn; diff --git a/applications/message_process/src/message_process.cc b/applications/message_process/src/message_process.cc index c890399..8a8eec7 100644 --- a/applications/message_process/src/message_process.cc +++ b/applications/message_process/src/message_process.cc @@ -240,13 +240,25 @@ bool MessageProcess::handleMessage() { vector<DataPtr> expectedOutput; - InferenceJob job("job", networkModel, ifm, ofm, expectedOutput, -1); + vector<uint8_t> pmuEventConfig; + for (uint32_t i = 0; i < ETHOSU_CORE_PMU_MAX; i++) { + pmuEventConfig.push_back(req.pmu_event_config[i]); + } + + InferenceJob job( + "job", networkModel, ifm, ofm, expectedOutput, -1, pmuEventConfig, req.pmu_cycle_counter_enable); job.invalidate(); bool failed = inferenceProcess.runJob(job); job.clean(); - sendInferenceRsp(req.user_arg, job.output, failed); + sendInferenceRsp(req.user_arg, + job.output, + failed, + job.pmuEventConfig, + job.pmuCycleCounterEnable, + job.pmuEventCount, + job.pmuCycleCounterCount); break; } default: { @@ -266,8 +278,19 @@ void MessageProcess::sendPong() { mailbox.sendMessage(); } -void MessageProcess::sendInferenceRsp(uint64_t userArg,
vector<DataPtr> &ofm, bool failed) { - ethosu_core_inference_rsp rsp; +void MessageProcess::sendInferenceRsp(uint64_t userArg, + vector<DataPtr> &ofm, + bool failed, + vector<uint8_t> &pmuEventConfig, + uint32_t pmuCycleCounterEnable, + vector<uint32_t> &pmuEventCount, + uint64_t pmuCycleCounterCount) { + ethosu_core_inference_rsp rsp = { + .pmu_event_count = + { + 0, + }, + }; rsp.user_arg = userArg; rsp.ofm_count = ofm.size(); @@ -277,6 +300,15 @@ void MessageProcess::sendInferenceRsp(uint64_t userArg, vector<DataPtr> &ofm, bo rsp.ofm_size[i] = ofm[i].size; } + for (size_t i = 0; i < pmuEventConfig.size(); i++) { + rsp.pmu_event_config[i] = pmuEventConfig[i]; + } + rsp.pmu_cycle_counter_enable = pmuCycleCounterEnable; + for (size_t i = 0; i < pmuEventCount.size(); i++) { + rsp.pmu_event_count[i] = pmuEventCount[i]; + } + rsp.pmu_cycle_counter_count = pmuCycleCounterCount; + printf("Sending inference response. userArg=0x%" PRIx64 ", ofm_count=%" PRIu32 ", status=%" PRIu32 "\n", rsp.user_arg, rsp.ofm_count, |