From dcc1ce0f9b64198f3bb82d9858ec79663808a35f Mon Sep 17 00:00:00 2001 From: Kristofer Jonsson Date: Tue, 21 Dec 2021 16:25:19 +0100 Subject: Inference process update Split runJob() into smaller functions to make the code easier to read and maintain. Correct return type of Layer By Layer profiler GetTotalTicks(). Change-Id: Ie414522017b3b6c6da9a09182439dbe4b2efdb1f --- .../include/inference_process.hpp | 18 ++ .../inference_process/src/inference_process.cpp | 279 ++++++++++++--------- .../include/layer_by_layer_profiler.hpp | 2 +- .../src/layer_by_layer_profiler.cpp | 20 +- 4 files changed, 193 insertions(+), 126 deletions(-) diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp index b8b2775..6ab453c 100644 --- a/applications/inference_process/include/inference_process.hpp +++ b/applications/inference_process/include/inference_process.hpp @@ -24,6 +24,14 @@ #include #include +struct TfLiteTensor; + +namespace tflite { +// Forward declarations +class MicroInterpreter; +class MicroResourceVariables; +} // namespace tflite + namespace InferenceProcess { struct DataPtr { void *data; @@ -33,6 +41,9 @@ struct DataPtr { void invalidate(); void clean(); + + char *begin() const; + char *end() const; }; struct InferenceJob { @@ -68,6 +79,13 @@ public: bool runJob(InferenceJob &job); private: + static bool copyIfm(InferenceJob &job, tflite::MicroInterpreter &interpreter); + static bool copyOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter); + static bool compareOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter); + static void printJob(InferenceJob &job, tflite::MicroInterpreter &interpreter); + static void printOutputTensor(TfLiteTensor *output, size_t bytesToPrint); + static void tfluDebugLog(const char *s); + uint8_t *tensorArena; const size_t tensorArenaSize; }; diff --git a/applications/inference_process/src/inference_process.cpp b/applications/inference_process/src/inference_process.cpp index 499cdd3..ebd9d6c 100644 --- a/applications/inference_process/src/inference_process.cpp +++ b/applications/inference_process/src/inference_process.cpp @@ -39,10 +39,6 @@ using namespace std; namespace { -void tflu_debug_log(const char *s) { - LOG("%s", s); -} - class Crc { public: constexpr Crc() : table() { @@ -79,55 +75,6 @@ private: uint32_t table[256]; }; -void print_output_data(TfLiteTensor *output, size_t bytesToPrint) { - constexpr auto crc = Crc(); - const uint32_t output_crc32 = crc.crc32(output->data.data, output->bytes); - const int numBytesToPrint = min(output->bytes, bytesToPrint); - int dims_size = output->dims->size; - LOG("{\n"); - LOG("\"dims\": [%d,", dims_size); - for (int i = 0; i < output->dims->size - 1; ++i) { - LOG("%d,", output->dims->data[i]); - } - LOG("%d],\n", output->dims->data[dims_size - 1]); - LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data); - if (numBytesToPrint) { - LOG("\"crc32\": \"%08" PRIx32 "\",\n", output_crc32); - LOG("\"data\":\""); - for (int i = 0; i < numBytesToPrint - 1; ++i) { - /* - * Workaround an issue when compiling with GCC where by - * printing only a '\n' the produced global output is wrong. - */ - if (i % 15 == 0 && i != 0) { - LOG("0x%02x,\n", output->data.uint8[i]); - } else { - LOG("0x%02x,", output->data.uint8[i]); - } - } - LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]); - } else { - LOG("\"crc32\": \"%08" PRIx32 "\"\n", output_crc32); - } - LOG("}"); -} - -bool copyOutput(const TfLiteTensor &src, InferenceProcess::DataPtr &dst) { - if (dst.data == nullptr) { - return false; - } - - if (src.bytes > dst.size) { - LOG_ERR("Tensor size mismatch (bytes): actual=%d, expected%d.", src.bytes, dst.size); - return true; - } - - copy(src.data.uint8, src.data.uint8 + src.bytes, static_cast(dst.data)); - dst.size = src.bytes; - - return false; -} - } // namespace namespace InferenceProcess { @@ -145,6 +92,14 @@ void DataPtr::clean() { #endif } +char *DataPtr::begin() const { + return static_cast(data); +} + +char *DataPtr::end() const { + return static_cast(data) + size; +} + InferenceJob::InferenceJob() : numBytesToPrint(0) {} InferenceJob::InferenceJob(const string &_name, @@ -199,10 +154,7 @@ bool InferenceProcess::runJob(InferenceJob &job) { LOG_INFO("Running inference job: %s", job.name.c_str()); // Register debug log callback for profiling - RegisterDebugLogCallback(tflu_debug_log); - - tflite::MicroErrorReporter microErrorReporter; - tflite::ErrorReporter *reporter = µErrorReporter; + RegisterDebugLogCallback(tfluDebugLog); // Get model handle and verify that the version is correct const tflite::Model *model = ::tflite::GetModel(job.networkModel.data); @@ -221,15 +173,57 @@ bool InferenceProcess::runJob(InferenceJob &job) { tflite::ArmProfiler profiler; #endif - tflite::MicroInterpreter interpreter(model, resolver, tensorArena, tensorArenaSize, reporter, nullptr, &profiler); + tflite::MicroErrorReporter errorReporter; + tflite::MicroInterpreter interpreter( + model, resolver, tensorArena, tensorArenaSize, &errorReporter, nullptr, &profiler); // Allocate tensors - TfLiteStatus allocate_status = interpreter.AllocateTensors(); - if (allocate_status != kTfLiteOk) { + TfLiteStatus status = interpreter.AllocateTensors(); + if (status != kTfLiteOk) { LOG_ERR("Failed to allocate tensors for inference: job=%s", job.name.c_str()); return true; } + // Copy IFM data from job descriptor to TFLu arena + if (copyIfm(job, interpreter)) { + return true; + } + + // Run the inference + status = interpreter.Invoke(); + if (status != kTfLiteOk) { + LOG_ERR("Invoke failed for inference: job=%s", job.name.c_str()); + return true; + } + +#ifdef LAYER_BY_LAYER_PROFILER + if (job.pmuCycleCounterEnable) { + job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount(); + } + + job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end()); +#endif + + LOG("Inference runtime: %" PRId32 " cycles\n", profiler.GetTotalTicks()); + + // Copy output data from TFLu arena to job descriptor + if (copyOfm(job, interpreter)) { + return true; + } + + printJob(job, interpreter); + + // Compare the OFM with the expected reference data + if (compareOfm(job, interpreter)) { + return true; + } + + LOG_INFO("Finished running job: %s", job.name.c_str()); + + return false; +} + +bool InferenceProcess::copyIfm(InferenceJob &job, tflite::MicroInterpreter &interpreter) { // Create a filtered list of non empty input tensors vector inputTensors; for (size_t i = 0; i < interpreter.inputs_size(); ++i) { @@ -239,6 +233,7 @@ bool InferenceProcess::runJob(InferenceJob &job) { inputTensors.push_back(tensor); } } + if (job.input.size() != inputTensors.size()) { LOG_ERR("Number of input buffers does not match number of non empty network tensors: input=%zu, network=%zu", job.input.size(), @@ -246,10 +241,10 @@ bool InferenceProcess::runJob(InferenceJob &job) { return true; } - // Copy input data + // Copy input data from job to TFLu arena for (size_t i = 0; i < inputTensors.size(); ++i) { - const DataPtr &input = job.input[i]; - const TfLiteTensor *tensor = inputTensors[i]; + DataPtr &input = job.input[i]; + TfLiteTensor *tensor = inputTensors[i]; if (input.size != tensor->bytes) { LOG_ERR("Job input size does not match network input size: job=%s, index=%zu, input=%zu, network=%u", @@ -260,96 +255,150 @@ bool InferenceProcess::runJob(InferenceJob &job) { return true; } - copy(static_cast(input.data), static_cast(input.data) + input.size, tensor->data.uint8); + copy(input.begin(), input.end(), tensor->data.uint8); } - // Run the inference - TfLiteStatus invoke_status = interpreter.Invoke(); - if (invoke_status != kTfLiteOk) { - LOG_ERR("Invoke failed for inference: job=%s", job.name.c_str()); + return false; +} + +bool InferenceProcess::copyOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter) { + // Skip copy if output is empty + if (job.output.empty()) { + return false; + } + + if (interpreter.outputs_size() != job.output.size()) { + LOG_ERR("Output size mismatch: job=%zu, network=%u", job.output.size(), interpreter.outputs_size()); return true; } - LOG("arena_used_bytes : %zu\n", interpreter.arena_used_bytes()); + for (unsigned i = 0; i < interpreter.outputs_size(); ++i) { + DataPtr &output = job.output[i]; + TfLiteTensor *tensor = interpreter.output(i); - LOG("Inference runtime: %u cycles\n", (unsigned int)profiler.GetTotalTicks()); + if (tensor->bytes > output.size) { + LOG_ERR("Tensor size mismatch: tensor=%d, expected=%d", tensor->bytes, output.size); + return true; + } -#ifdef LAYER_BY_LAYER_PROFILER - if (job.pmuCycleCounterEnable) { - job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount(); + copy(tensor->data.uint8, tensor->data.uint8 + tensor->bytes, output.begin()); } - job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end()); -#endif - // Copy output data - if (job.output.size() > 0) { - if (interpreter.outputs_size() != job.output.size()) { - LOG_ERR("Output size mismatch: job=%zu, network=%u", job.output.size(), interpreter.outputs_size()); + return false; +} + +bool InferenceProcess::compareOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter) { + // Skip verification if expected output is empty + if (job.expectedOutput.empty()) { + return false; + } + + if (job.expectedOutput.size() != interpreter.outputs_size()) { + LOG_ERR("Expected number of output tensors mismatch: job=%s, expected=%zu, network=%zu", + job.name.c_str(), + job.expectedOutput.size(), + interpreter.outputs_size()); + return true; + } + + for (unsigned int i = 0; i < interpreter.outputs_size(); i++) { + const DataPtr &expected = job.expectedOutput[i]; + const TfLiteTensor *output = interpreter.output(i); + + if (expected.size != output->bytes) { + LOG_ERR("Expected output tensor size mismatch: job=%s, index=%u, expected=%zu, network=%zu", + job.name.c_str(), + i, + expected.size, + output->bytes); return true; } - for (unsigned i = 0; i < interpreter.outputs_size(); ++i) { - if (copyOutput(*interpreter.output(i), job.output[i])) { + const char *exp = expected.begin(); + for (unsigned int j = 0; j < output->bytes; ++j) { + if (output->data.uint8[j] != exp[j]) { + LOG_ERR("Expected output tensor data mismatch: job=%s, index=%u, offset=%u, " + "expected=%02x, network=%02x\n", + job.name.c_str(), + i, + j, + exp[j], + output->data.uint8[j]); return true; } } } + return false; +} + +void InferenceProcess::printJob(InferenceJob &job, tflite::MicroInterpreter &interpreter) { + for (size_t i = 0; i < job.pmuEventCount.size(); i++) { + LOG("ethosu_pmu_cntr%zu : %" PRIu32 "\n", i, job.pmuEventCount[i]); + } + + LOG("arena_used_bytes : %zu\n", interpreter.arena_used_bytes()); + // Print all of the output data, or the first NUM_BYTES_TO_PRINT bytes, // whichever comes first as well as the output shape. LOG("num_of_outputs: %d\n", interpreter.outputs_size()); LOG("output_begin\n"); LOG("[\n"); + for (unsigned int i = 0; i < interpreter.outputs_size(); i++) { - TfLiteTensor *output = interpreter.output(i); - print_output_data(output, job.numBytesToPrint); + printOutputTensor(interpreter.output(i), job.numBytesToPrint); + if (i != interpreter.outputs_size() - 1) { LOG(",\n"); } } + LOG("]\n"); LOG("output_end\n"); +} - if (job.expectedOutput.size() > 0) { - if (job.expectedOutput.size() != interpreter.outputs_size()) { - LOG_ERR("Expected number of output tensors mismatch: job=%s, expected=%zu, network=%zu", - job.name.c_str(), - job.expectedOutput.size(), - interpreter.outputs_size()); - return true; - } +void InferenceProcess::printOutputTensor(TfLiteTensor *output, size_t bytesToPrint) { + constexpr auto crc = Crc(); + const uint32_t crc32 = crc.crc32(output->data.data, output->bytes); + const int numBytesToPrint = min(output->bytes, bytesToPrint); + int dims_size = output->dims->size; - for (unsigned int i = 0; i < interpreter.outputs_size(); i++) { - const DataPtr &expected = job.expectedOutput[i]; - const TfLiteTensor *output = interpreter.output(i); + LOG("{\n"); + LOG("\"dims\": [%d,", dims_size); - if (expected.size != output->bytes) { - LOG_ERR("Expected output tensor size mismatch: job=%s, index=%u, expected=%zu, network=%zu", - job.name.c_str(), - i, - expected.size, - output->bytes); - return true; - } + for (int i = 0; i < output->dims->size - 1; ++i) { + LOG("%d,", output->dims->data[i]); + } - for (unsigned int j = 0; j < output->bytes; ++j) { - if (output->data.uint8[j] != static_cast(expected.data)[j]) { - LOG_ERR("Expected output tensor data mismatch: job=%s, index=%u, offset=%u, " - "expected=%02x, network=%02x\n", - job.name.c_str(), - i, - j, - static_cast(expected.data)[j], - output->data.uint8[j]); - return true; - } + LOG("%d],\n", output->dims->data[dims_size - 1]); + LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data); + + if (numBytesToPrint) { + LOG("\"crc32\": \"%08" PRIx32 "\",\n", crc32); + LOG("\"data\":\""); + + for (int i = 0; i < numBytesToPrint - 1; ++i) { + /* + * Workaround an issue when compiling with GCC where by + * printing only a '\n' the produced global output is wrong. + */ + if (i % 15 == 0 && i != 0) { + LOG("0x%02x,\n", output->data.uint8[i]); + } else { + LOG("0x%02x,", output->data.uint8[i]); } } + + LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]); + } else { + LOG("\"crc32\": \"%08" PRIx32 "\"\n", crc32); } - LOG_INFO("Finished running job: %s", job.name.c_str()); + LOG("}"); +} - return false; +void InferenceProcess::tfluDebugLog(const char *s) { + LOG("%s", s); } } // namespace InferenceProcess diff --git a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp index 37bd868..a547576 100644 --- a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp +++ b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp @@ -38,7 +38,7 @@ public: int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone)); uint32_t BeginEvent(const char *tag); void EndEvent(uint32_t event_handle); - uint64_t GetTotalTicks() const; + int32_t GetTotalTicks() const; void Log() const; uint64_t GetPmuCycleCounterCount() const; diff --git a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp index a5b8e5a..a4f67d6 100644 --- a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp +++ b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp @@ -40,13 +40,13 @@ uint64_t GetCurrentEthosuTicks(struct ethosu_driver *drv) { namespace tflite { LayerByLayerProfiler::LayerByLayerProfiler(const std::vector &event_config, - bool pmu_cycle_counter_enable, + bool _pmu_cycle_counter_enable, size_t max_events, - Backend backend, - int32_t event_id) : + Backend _backend, + int32_t _event_id) : pmu_event_config(event_config), - pmu_event_count(), pmu_cycle_counter_enable(pmu_cycle_counter_enable), pmu_cycle_counter_count(0), - max_events_(max_events), backend(backend), event_id(event_id), num_events_(0) { + pmu_event_count(), pmu_cycle_counter_enable(_pmu_cycle_counter_enable), pmu_cycle_counter_count(0), + max_events_(max_events), backend(_backend), event_id(_event_id), num_events_(0) { tags_ = std::make_unique(max_events); start_ticks_ = std::make_unique(max_events); @@ -67,7 +67,7 @@ uint32_t LayerByLayerProfiler::BeginEvent(const char *tag) { size_t numEventCounters = ETHOSU_PMU_Get_NumEventCounters(); if (pmu_event_config.size() > numEventCounters) { - LOG_WARN("PMU event config list is bigger (%lu) than available PMU event counters (%lu)", + LOG_WARN("PMU event config list is bigger (%zu) than available PMU event counters (%zu)", pmu_event_config.size(), numEventCounters); LOG_WARN("PMU event config list will be truncated"); @@ -135,7 +135,7 @@ void LayerByLayerProfiler::EndEvent(uint32_t event_handle) { if (backend == PRINTF) { if (strcmp("ethos-u", tags_[event_handle]) == 0) { for (size_t i = 0; i < pmu_event_count.size(); i++) { - LOG("ethos-u : ethosu_pmu_cntr%lu : %u\n", i, pmu_event_count[i]); + LOG("ethos-u : ethosu_pmu_cntr%zu : %" PRIu32 "\n", i, pmu_event_count[i]); } LOG("ethos-u : cycle_cnt : %" PRIu64 " cycles\n", pmu_cycle_counter_count); } else { @@ -148,11 +148,11 @@ void LayerByLayerProfiler::EndEvent(uint32_t event_handle) { } } -uint64_t LayerByLayerProfiler::GetTotalTicks() const { - uint64_t ticks = 0; +int32_t LayerByLayerProfiler::GetTotalTicks() const { + int32_t ticks = 0; for (size_t i = 0; i < num_events_; ++i) { - ticks += end_ticks_[i] - start_ticks_[i]; + ticks += static_cast(end_ticks_[i] - start_ticks_[i]); } return ticks; -- cgit v1.2.1