From dcc1ce0f9b64198f3bb82d9858ec79663808a35f Mon Sep 17 00:00:00 2001
From: Kristofer Jonsson <kristofer.jonsson@arm.com>
Date: Tue, 21 Dec 2021 16:25:19 +0100
Subject: Inference process update

Split runJob() into smaller functions to make the code easier to
read and maintain.

Correct return type of Layer By Layer profiler GetTotalTicks().

Change-Id: Ie414522017b3b6c6da9a09182439dbe4b2efdb1f
---
 .../include/inference_process.hpp                  |  18 ++
 .../inference_process/src/inference_process.cpp    | 279 ++++++++++++---------
 .../include/layer_by_layer_profiler.hpp            |   2 +-
 .../src/layer_by_layer_profiler.cpp                |  20 +-
 4 files changed, 193 insertions(+), 126 deletions(-)
diff --git a/applications/inference_process/include/inference_process.hpp b/applications/inference_process/include/inference_process.hpp
index b8b2775..6ab453c 100644
--- a/applications/inference_process/include/inference_process.hpp
+++ b/applications/inference_process/include/inference_process.hpp
@@ -24,6 +24,14 @@
 #include <string>
 #include <vector>
 
+struct TfLiteTensor;
+
+namespace tflite {
+// Forward declarations
+class MicroInterpreter;
+class MicroResourceVariables;
+} // namespace tflite
+
 namespace InferenceProcess {
 struct DataPtr {
     void *data;
@@ -33,6 +41,9 @@ struct DataPtr {
 
     void invalidate();
     void clean();
+
+    char *begin() const;
+    char *end() const;
 };
 
 struct InferenceJob {
@@ -68,6 +79,13 @@ public:
     bool runJob(InferenceJob &job);
 
 private:
+    static bool copyIfm(InferenceJob &job, tflite::MicroInterpreter &interpreter);
+    static bool copyOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter);
+    static bool compareOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter);
+    static void printJob(InferenceJob &job, tflite::MicroInterpreter &interpreter);
+    static void printOutputTensor(TfLiteTensor *output, size_t bytesToPrint);
+    static void tfluDebugLog(const char *s);
+
     uint8_t *tensorArena;
     const size_t tensorArenaSize;
 };
diff --git a/applications/inference_process/src/inference_process.cpp b/applications/inference_process/src/inference_process.cpp
index 499cdd3..ebd9d6c 100644
--- a/applications/inference_process/src/inference_process.cpp
+++ b/applications/inference_process/src/inference_process.cpp
@@ -39,10 +39,6 @@ using namespace std;
 
 namespace {
 
-void tflu_debug_log(const char *s) {
-    LOG("%s", s);
-}
-
 class Crc {
 public:
     constexpr Crc() : table() {
@@ -79,55 +75,6 @@ private:
     uint32_t table[256];
 };
 
-void print_output_data(TfLiteTensor *output, size_t bytesToPrint) {
-    constexpr auto crc          = Crc();
-    const uint32_t output_crc32 = crc.crc32(output->data.data, output->bytes);
-    const int numBytesToPrint   = min(output->bytes, bytesToPrint);
-    int dims_size               = output->dims->size;
-    LOG("{\n");
-    LOG("\"dims\": [%d,", dims_size);
-    for (int i = 0; i < output->dims->size - 1; ++i) {
-        LOG("%d,", output->dims->data[i]);
-    }
-    LOG("%d],\n", output->dims->data[dims_size - 1]);
-    LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data);
-    if (numBytesToPrint) {
-        LOG("\"crc32\": \"%08" PRIx32 "\",\n", output_crc32);
-        LOG("\"data\":\"");
-        for (int i = 0; i < numBytesToPrint - 1; ++i) {
-            /*
-             * Workaround an issue when compiling with GCC where by
-             * printing only a '\n' the produced global output is wrong.
-             */
-            if (i % 15 == 0 && i != 0) {
-                LOG("0x%02x,\n", output->data.uint8[i]);
-            } else {
-                LOG("0x%02x,", output->data.uint8[i]);
-            }
-        }
-        LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]);
-    } else {
-        LOG("\"crc32\": \"%08" PRIx32 "\"\n", output_crc32);
-    }
-    LOG("}");
-}
-
-bool copyOutput(const TfLiteTensor &src, InferenceProcess::DataPtr &dst) {
-    if (dst.data == nullptr) {
-        return false;
-    }
-
-    if (src.bytes > dst.size) {
-        LOG_ERR("Tensor size mismatch (bytes): actual=%d, expected%d.", src.bytes, dst.size);
-        return true;
-    }
-
-    copy(src.data.uint8, src.data.uint8 + src.bytes, static_cast<uint8_t *>(dst.data));
-    dst.size = src.bytes;
-
-    return false;
-}
-
 } // namespace
 
 namespace InferenceProcess {
@@ -145,6 +92,14 @@ void DataPtr::clean() {
 #endif
 }
 
+char *DataPtr::begin() const {
+    return static_cast<char *>(data);
+}
+
+char *DataPtr::end() const {
+    return static_cast<char *>(data) + size;
+}
+
 InferenceJob::InferenceJob() : numBytesToPrint(0) {}
 
 InferenceJob::InferenceJob(const string &_name,
@@ -199,10 +154,7 @@ bool InferenceProcess::runJob(InferenceJob &job) {
     LOG_INFO("Running inference job: %s", job.name.c_str());
 
     // Register debug log callback for profiling
-    RegisterDebugLogCallback(tflu_debug_log);
-
-    tflite::MicroErrorReporter microErrorReporter;
-    tflite::ErrorReporter *reporter = &microErrorReporter;
+    RegisterDebugLogCallback(tfluDebugLog);
 
     // Get model handle and verify that the version is correct
     const tflite::Model *model = ::tflite::GetModel(job.networkModel.data);
@@ -221,15 +173,57 @@ bool InferenceProcess::runJob(InferenceJob &job) {
     tflite::ArmProfiler profiler;
 #endif
 
-    tflite::MicroInterpreter interpreter(model, resolver, tensorArena, tensorArenaSize, reporter, nullptr, &profiler);
+    tflite::MicroErrorReporter errorReporter;
+    tflite::MicroInterpreter interpreter(
+        model, resolver, tensorArena, tensorArenaSize, &errorReporter, nullptr, &profiler);
 
     // Allocate tensors
-    TfLiteStatus allocate_status = interpreter.AllocateTensors();
-    if (allocate_status != kTfLiteOk) {
+    TfLiteStatus status = interpreter.AllocateTensors();
+    if (status != kTfLiteOk) {
         LOG_ERR("Failed to allocate tensors for inference: job=%s", job.name.c_str());
         return true;
     }
 
+    // Copy IFM data from job descriptor to TFLu arena
+    if (copyIfm(job, interpreter)) {
+        return true;
+    }
+
+    // Run the inference
+    status = interpreter.Invoke();
+    if (status != kTfLiteOk) {
+        LOG_ERR("Invoke failed for inference: job=%s", job.name.c_str());
+        return true;
+    }
+
+#ifdef LAYER_BY_LAYER_PROFILER
+    if (job.pmuCycleCounterEnable) {
+        job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount();
+    }
+
+    job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end());
+#endif
+
+    LOG("Inference runtime: %" PRId32 " cycles\n", profiler.GetTotalTicks());
+
+    // Copy output data from TFLu arena to job descriptor
+    if (copyOfm(job, interpreter)) {
+        return true;
+    }
+
+    printJob(job, interpreter);
+
+    // Compare the OFM with the expected reference data
+    if (compareOfm(job, interpreter)) {
+        return true;
+    }
+
+    LOG_INFO("Finished running job: %s", job.name.c_str());
+
+    return false;
+}
+
+bool InferenceProcess::copyIfm(InferenceJob &job, tflite::MicroInterpreter &interpreter) {
     // Create a filtered list of non empty input tensors
     vector<TfLiteTensor *> inputTensors;
     for (size_t i = 0; i < interpreter.inputs_size(); ++i) {
@@ -239,6 +233,7 @@ bool InferenceProcess::runJob(InferenceJob &job) {
             inputTensors.push_back(tensor);
         }
     }
+
     if (job.input.size() != inputTensors.size()) {
         LOG_ERR("Number of input buffers does not match number of non empty network tensors: input=%zu, network=%zu",
                 job.input.size(),
@@ -246,10 +241,10 @@ bool InferenceProcess::runJob(InferenceJob &job) {
         return true;
     }
 
-    // Copy input data
+    // Copy input data from job to TFLu arena
     for (size_t i = 0; i < inputTensors.size(); ++i) {
-        const DataPtr &input       = job.input[i];
-        const TfLiteTensor *tensor = inputTensors[i];
+        DataPtr &input       = job.input[i];
+        TfLiteTensor *tensor = inputTensors[i];
 
         if (input.size != tensor->bytes) {
             LOG_ERR("Job input size does not match network input size: job=%s, index=%zu, input=%zu, network=%u",
@@ -260,96 +255,150 @@ bool InferenceProcess::runJob(InferenceJob &job) {
             return true;
         }
 
-        copy(static_cast<char *>(input.data), static_cast<char *>(input.data) + input.size, tensor->data.uint8);
+        copy(input.begin(), input.end(), tensor->data.uint8);
     }
 
-    // Run the inference
-    TfLiteStatus invoke_status = interpreter.Invoke();
-    if (invoke_status != kTfLiteOk) {
-        LOG_ERR("Invoke failed for inference: job=%s", job.name.c_str());
+    return false;
+}
+
+bool InferenceProcess::copyOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter) {
+    // Skip copy if the job supplies no output buffers
+    if (job.output.empty()) {
+        return false;
+    }
+
+    if (interpreter.outputs_size() != job.output.size()) {
+        LOG_ERR("Output size mismatch: job=%zu, network=%zu", job.output.size(), interpreter.outputs_size());
         return true;
     }
 
-    LOG("arena_used_bytes : %zu\n", interpreter.arena_used_bytes());
+    for (unsigned i = 0; i < interpreter.outputs_size(); ++i) {
+        DataPtr &output      = job.output[i];
+        TfLiteTensor *tensor = interpreter.output(i);
 
-    LOG("Inference runtime: %u cycles\n", (unsigned int)profiler.GetTotalTicks());
+        if (tensor->bytes > output.size) {
+            LOG_ERR("Tensor size mismatch: tensor=%zu, expected=%zu", tensor->bytes, output.size);
+            return true;
+        }
 
-#ifdef LAYER_BY_LAYER_PROFILER
-    if (job.pmuCycleCounterEnable) {
-        job.pmuCycleCounterCount = profiler.GetPmuCycleCounterCount();
+        copy(tensor->data.uint8, tensor->data.uint8 + tensor->bytes, output.begin());
     }
-    job.pmuEventCount.assign(profiler.GetPmuEventCount().begin(), profiler.GetPmuEventCount().end());
-#endif
 
-    // Copy output data
-    if (job.output.size() > 0) {
-        if (interpreter.outputs_size() != job.output.size()) {
-            LOG_ERR("Output size mismatch: job=%zu, network=%u", job.output.size(), interpreter.outputs_size());
+    return false;
+}
+
+bool InferenceProcess::compareOfm(InferenceJob &job, tflite::MicroInterpreter &interpreter) {
+    // Skip verification if the job supplies no expected output
+    if (job.expectedOutput.empty()) {
+        return false;
+    }
+
+    if (job.expectedOutput.size() != interpreter.outputs_size()) {
+        LOG_ERR("Expected number of output tensors mismatch: job=%s, expected=%zu, network=%zu",
+                job.name.c_str(),
+                job.expectedOutput.size(),
+                interpreter.outputs_size());
+        return true;
+    }
+
+    for (unsigned int i = 0; i < interpreter.outputs_size(); i++) {
+        const DataPtr &expected    = job.expectedOutput[i];
+        const TfLiteTensor *output = interpreter.output(i);
+
+        if (expected.size != output->bytes) {
+            LOG_ERR("Expected output tensor size mismatch: job=%s, index=%u, expected=%zu, network=%zu",
+                    job.name.c_str(),
+                    i,
+                    expected.size,
+                    output->bytes);
             return true;
         }
 
-        for (unsigned i = 0; i < interpreter.outputs_size(); ++i) {
-            if (copyOutput(*interpreter.output(i), job.output[i])) {
+        const uint8_t *exp = static_cast<const uint8_t *>(expected.data); // unsigned view: plain char may be signed
+        for (unsigned int j = 0; j < output->bytes; ++j) {
+            if (output->data.uint8[j] != exp[j]) {
+                LOG_ERR("Expected output tensor data mismatch: job=%s, index=%u, offset=%u, "
+                        "expected=%02x, network=%02x\n",
+                        job.name.c_str(),
+                        i,
+                        j,
+                        exp[j],
+                        output->data.uint8[j]);
                 return true;
             }
         }
     }
 
+    return false;
+}
+
+void InferenceProcess::printJob(InferenceJob &job, tflite::MicroInterpreter &interpreter) {
+    for (size_t i = 0; i < job.pmuEventCount.size(); i++) {
+        LOG("ethosu_pmu_cntr%zu : %" PRIu32 "\n", i, job.pmuEventCount[i]);
+    }
+
+    LOG("arena_used_bytes : %zu\n", interpreter.arena_used_bytes());
+
     // Print all of the output data, or the first NUM_BYTES_TO_PRINT bytes,
     // whichever comes first as well as the output shape.
     LOG("num_of_outputs: %d\n", interpreter.outputs_size());
     LOG("output_begin\n");
     LOG("[\n");
+
     for (unsigned int i = 0; i < interpreter.outputs_size(); i++) {
-        TfLiteTensor *output = interpreter.output(i);
-        print_output_data(output, job.numBytesToPrint);
+        printOutputTensor(interpreter.output(i), job.numBytesToPrint);
+
         if (i != interpreter.outputs_size() - 1) {
             LOG(",\n");
         }
     }
+
     LOG("]\n");
     LOG("output_end\n");
+}
 
-    if (job.expectedOutput.size() > 0) {
-        if (job.expectedOutput.size() != interpreter.outputs_size()) {
-            LOG_ERR("Expected number of output tensors mismatch: job=%s, expected=%zu, network=%zu",
-                    job.name.c_str(),
-                    job.expectedOutput.size(),
-                    interpreter.outputs_size());
-            return true;
-        }
+void InferenceProcess::printOutputTensor(TfLiteTensor *output, size_t bytesToPrint) {
+    constexpr auto crc        = Crc();
+    const uint32_t crc32      = crc.crc32(output->data.data, output->bytes);
+    const int numBytesToPrint = min(output->bytes, bytesToPrint);
+    int dims_size             = output->dims->size;
 
-        for (unsigned int i = 0; i < interpreter.outputs_size(); i++) {
-            const DataPtr &expected    = job.expectedOutput[i];
-            const TfLiteTensor *output = interpreter.output(i);
+    LOG("{\n");
+    LOG("\"dims\": [%d,", dims_size);
 
-            if (expected.size != output->bytes) {
-                LOG_ERR("Expected output tensor size mismatch: job=%s, index=%u, expected=%zu, network=%zu",
-                        job.name.c_str(),
-                        i,
-                        expected.size,
-                        output->bytes);
-                return true;
-            }
+    for (int i = 0; i < output->dims->size - 1; ++i) {
+        LOG("%d,", output->dims->data[i]);
+    }
 
-            for (unsigned int j = 0; j < output->bytes; ++j) {
-                if (output->data.uint8[j] != static_cast<uint8_t *>(expected.data)[j]) {
-                    LOG_ERR("Expected output tensor data mismatch: job=%s, index=%u, offset=%u, "
-                            "expected=%02x, network=%02x\n",
-                            job.name.c_str(),
-                            i,
-                            j,
-                            static_cast<uint8_t *>(expected.data)[j],
-                            output->data.uint8[j]);
-                    return true;
-                }
+    LOG("%d],\n", output->dims->data[dims_size - 1]);
+    LOG("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data);
+
+    if (numBytesToPrint) {
+        LOG("\"crc32\": \"%08" PRIx32 "\",\n", crc32);
+        LOG("\"data\":\"");
+
+        for (int i = 0; i < numBytesToPrint - 1; ++i) {
+            /*
+             * Workaround an issue when compiling with GCC where by
+             * printing only a '\n' the produced global output is wrong.
+             */
+            if (i % 15 == 0 && i != 0) {
+                LOG("0x%02x,\n", output->data.uint8[i]);
+            } else {
+                LOG("0x%02x,", output->data.uint8[i]);
             }
         }
+
+        LOG("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]);
+    } else {
+        LOG("\"crc32\": \"%08" PRIx32 "\"\n", crc32);
     }
 
-    LOG_INFO("Finished running job: %s", job.name.c_str());
+    LOG("}");
+}
 
-    return false;
+void InferenceProcess::tfluDebugLog(const char *s) {
+    LOG("%s", s);
 }
 
 } // namespace InferenceProcess
diff --git a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
index 37bd868..a547576 100644
--- a/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
+++ b/lib/layer_by_layer_profiler/include/layer_by_layer_profiler.hpp
@@ -38,7 +38,7 @@ public:
                          int32_t event_id = EventID(EventLevelError, EvtStatistics_No, EventRecordNone));
     uint32_t BeginEvent(const char *tag);
     void EndEvent(uint32_t event_handle);
-    uint64_t GetTotalTicks() const;
+    int32_t GetTotalTicks() const;
     void Log() const;
 
     uint64_t GetPmuCycleCounterCount() const;
diff --git a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
index a5b8e5a..a4f67d6 100644
--- a/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
+++ b/lib/layer_by_layer_profiler/src/layer_by_layer_profiler.cpp
@@ -40,13 +40,13 @@ uint64_t GetCurrentEthosuTicks(struct ethosu_driver *drv) {
 namespace tflite {
 
 LayerByLayerProfiler::LayerByLayerProfiler(const std::vector<uint8_t> &event_config,
-                                           bool pmu_cycle_counter_enable,
+                                           bool _pmu_cycle_counter_enable,
                                            size_t max_events,
-                                           Backend backend,
-                                           int32_t event_id) :
+                                           Backend _backend,
+                                           int32_t _event_id) :
     pmu_event_config(event_config),
-    pmu_event_count(), pmu_cycle_counter_enable(pmu_cycle_counter_enable), pmu_cycle_counter_count(0),
-    max_events_(max_events), backend(backend), event_id(event_id), num_events_(0) {
+    pmu_event_count(), pmu_cycle_counter_enable(_pmu_cycle_counter_enable), pmu_cycle_counter_count(0),
+    max_events_(max_events), backend(_backend), event_id(_event_id), num_events_(0) {
 
     tags_        = std::make_unique<const char *[]>(max_events);
     start_ticks_ = std::make_unique<uint64_t[]>(max_events);
@@ -67,7 +67,7 @@ uint32_t LayerByLayerProfiler::BeginEvent(const char *tag) {
         size_t numEventCounters   = ETHOSU_PMU_Get_NumEventCounters();
 
         if (pmu_event_config.size() > numEventCounters) {
-            LOG_WARN("PMU event config list is bigger (%lu) than available PMU event counters (%lu)",
+            LOG_WARN("PMU event config list is bigger (%zu) than available PMU event counters (%zu)",
                      pmu_event_config.size(),
                      numEventCounters);
             LOG_WARN("PMU event config list will be truncated");
@@ -135,7 +135,7 @@ void LayerByLayerProfiler::EndEvent(uint32_t event_handle) {
     if (backend == PRINTF) {
         if (strcmp("ethos-u", tags_[event_handle]) == 0) {
             for (size_t i = 0; i < pmu_event_count.size(); i++) {
-                LOG("ethos-u : ethosu_pmu_cntr%lu : %u\n", i, pmu_event_count[i]);
+                LOG("ethos-u : ethosu_pmu_cntr%zu : %" PRIu32 "\n", i, pmu_event_count[i]);
             }
             LOG("ethos-u : cycle_cnt : %" PRIu64 " cycles\n", pmu_cycle_counter_count);
         } else {
@@ -148,11 +148,11 @@ void LayerByLayerProfiler::EndEvent(uint32_t event_handle) {
     }
 }
 
-uint64_t LayerByLayerProfiler::GetTotalTicks() const {
-    uint64_t ticks = 0;
+int32_t LayerByLayerProfiler::GetTotalTicks() const {
+    int32_t ticks = 0;
 
     for (size_t i = 0; i < num_events_; ++i) {
-        ticks += end_ticks_[i] - start_ticks_[i];
+        ticks += static_cast<int32_t>(end_ticks_[i] - start_ticks_[i]);
     }
 
     return ticks;
-- 
cgit v1.2.1