From 6a7703ea93b962869a78976811b8a5920f298f67 Mon Sep 17 00:00:00 2001
From: Anton Moberg
Date: Tue, 2 Mar 2021 15:14:29 +0100
Subject: Multi NPU freertos example

Modified freertos.cpp to support the multi NPU API.

Spawn NUM_INFERENCE_TASKS inference processing tasks. Each task
reserves, locks and runs inferences on an available NPU driver
(usually one task per NPU, but not required).

Spawn NUM_JOB_TASKS job creating/receiving tasks. Each task creates
NUM_JOBS_PER_TASK jobs, sends them to the inference job queue, and
receives the responses once the inferences have finished.

Added: Mutex & semaphore interfaces that override the weakly linked
symbols in the driver.

Change-Id: Id187c6c9b43b988b1fdd560fdf3d57bd4b30b79c
---
 applications/freertos/main.cpp | 277 ++++++++++++++++++++++++++---------------
 applications/freertos/model.h  |   2 +
 applications/freertos/output.h |   2 +-
 3 files changed, 183 insertions(+), 98 deletions(-)

diff --git a/applications/freertos/main.cpp b/applications/freertos/main.cpp
index 6f92faf..0010b70 100644
--- a/applications/freertos/main.cpp
+++ b/applications/freertos/main.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited. All rights reserved.
+ * Copyright (c) 2019-2021 Arm Limited. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -20,147 +20,230 @@
  * Includes
  ****************************************************************************/
 
-// FreeRTOS
 #include "FreeRTOS.h"
 #include "queue.h"
+#include "semphr.h"
 #include "task.h"
 
-// Ethos-U
-#include "ethosu_driver.h"
+#include <inttypes.h>
+#include <stdio.h>
+#include <vector>
+
 #include "inference_process.hpp"
 
-// System includes
-#include <stdio.h>
+// Model data (Defined & changeable by modifying the compile definition in CMakeLists.txt)
+#include "input.h"
+#include "model.h"
+#include "output.h"
 
 using namespace std;
 using namespace InferenceProcess;
 
+/****************************************************************************
+ * Defines
+ ****************************************************************************/
+
+// Nr. of tasks to process inferences with. Task reserves driver & runs inference (Normally 1 per NPU, but not required)
+#define NUM_INFERENCE_TASKS 1
+// Nr. of tasks to create jobs and receive responses
+#define NUM_JOB_TASKS 1
+// Nr. of jobs to create per job task
+#define NUM_JOBS_PER_TASK 1
+
+// Tensor arena size
+#ifdef TENSOR_ARENA_SIZE // If defined in model.h
+#define TENSOR_ARENA_SIZE_PER_INFERENCE TENSOR_ARENA_SIZE
+#else // If not defined, use maximum available
+#define TENSOR_ARENA_SIZE_PER_INFERENCE 2000000 / NUM_INFERENCE_TASKS
+#endif
+
 /****************************************************************************
  * InferenceJob
  ****************************************************************************/
 
-#define TENSOR_ARENA_SIZE 0xa0000
+struct ProcessTaskParams {
+    ProcessTaskParams() {}
+    ProcessTaskParams(QueueHandle_t _queue, uint8_t *_tensorArena, size_t _arenaSize) :
+        queueHandle(_queue), tensorArena(_tensorArena), arenaSize(_arenaSize) {}
 
-__attribute__((section(".bss.tensor_arena"), aligned(16))) uint8_t inferenceProcessTensorArena[TENSOR_ARENA_SIZE];
+    QueueHandle_t queueHandle;
+    uint8_t *tensorArena;
+    size_t arenaSize;
+};
+
+// Number of total completed jobs, needed to exit the application correctly if NUM_JOB_TASKS > 1
+static int totalCompletedJobs = 0;
 
-namespace {
+// TensorArena static initialisation
+static const size_t arenaSize = TENSOR_ARENA_SIZE_PER_INFERENCE;
+__attribute__((section(".bss.tensor_arena"), aligned(16)))
+uint8_t inferenceProcessTensorArena[NUM_INFERENCE_TASKS][arenaSize];
 
+// Wrapper around InferenceProcess::InferenceJob. Adds responseQueue and status for FreeRTOS multi-tasking purposes.
 struct xInferenceJob : public InferenceJob {
-    QueueHandle_t queue;
+    QueueHandle_t responseQueue;
     bool status;
 
-    xInferenceJob();
-    xInferenceJob(const string &name,
-                  const DataPtr &networkModel,
-                  const vector<DataPtr> &input,
-                  const vector<DataPtr> &output,
-                  const vector<DataPtr> &expectedOutput,
-                  size_t numBytesToPrint,
-                  const vector<uint8_t> &pmuEventConfig,
-                  const uint32_t pmuCycleCounterEnable,
-                  QueueHandle_t queue);
+    xInferenceJob() : InferenceJob(), responseQueue(nullptr), status(false) {}
+    xInferenceJob(const string &_name,
+                  const DataPtr &_networkModel,
+                  const vector<DataPtr> &_input,
+                  const vector<DataPtr> &_output,
+                  const vector<DataPtr> &_expectedOutput,
+                  const size_t _numBytesToPrint,
+                  const vector<uint8_t> &_pmuEventConfig,
+                  const uint32_t _pmuCycleCounterEnable,
+                  QueueHandle_t _queue) :
+        InferenceJob(_name,
+                     _networkModel,
+                     _input,
+                     _output,
+                     _expectedOutput,
+                     _numBytesToPrint,
+                     _pmuEventConfig,
+                     _pmuCycleCounterEnable),
+        responseQueue(_queue), status(false) {}
 };
 
-xInferenceJob::xInferenceJob() : InferenceJob(), queue(nullptr), status(false) {}
-
-xInferenceJob::xInferenceJob(const std::string &_name,
-                             const DataPtr &_networkModel,
-                             const std::vector<DataPtr> &_input,
-                             const std::vector<DataPtr> &_output,
-                             const std::vector<DataPtr> &_expectedOutput,
-                             size_t _numBytesToPrint,
-                             const vector<uint8_t> &_pmuEventConfig,
-                             const uint32_t _pmuCycleCounterEnable,
-                             QueueHandle_t _queue) :
-    InferenceJob(_name,
-                 _networkModel,
-                 _input,
-                 _output,
-                 _expectedOutput,
-                 _numBytesToPrint,
-                 _pmuEventConfig,
-                 _pmuCycleCounterEnable),
-    queue(_queue), status(false) {}
-
-} // namespace
-
 /****************************************************************************
- * Functions
+ * Mutex & Semaphore
+ * Overrides weak-linked symbols in ethosu_driver.c to implement thread handling
  ****************************************************************************/
 
-namespace {
+extern "C" {
 
-#include "model.h"
-#include "input.h"
-#include "output.h"
+void *ethosu_mutex_create(void) {
+    return xSemaphoreCreateMutex();
+}
 
-void inferenceProcessTask(void *pvParameters) {
-    QueueHandle_t queue = reinterpret_cast<QueueHandle_t>(pvParameters);
+void ethosu_mutex_lock(void *mutex) {
+    SemaphoreHandle_t handle = reinterpret_cast<SemaphoreHandle_t>(mutex);
+    xSemaphoreTake(handle, portMAX_DELAY);
+}
+
+void ethosu_mutex_unlock(void *mutex) {
+    SemaphoreHandle_t handle = reinterpret_cast<SemaphoreHandle_t>(mutex);
+    xSemaphoreGive(handle);
+}
 
-    class InferenceProcess inferenceProcess(inferenceProcessTensorArena, TENSOR_ARENA_SIZE);
+void *ethosu_semaphore_create(void) {
+    return xSemaphoreCreateBinary();
+}
 
-    while (true) {
-        xInferenceJob *job;
+void ethosu_semaphore_take(void *sem) {
+    SemaphoreHandle_t handle = reinterpret_cast<SemaphoreHandle_t>(sem);
+    xSemaphoreTake(handle, portMAX_DELAY);
+}
 
-        // Wait for inference job
-        xQueueReceive(queue, &job, portMAX_DELAY);
-        printf("Received inference job. job=%p, name=%s\n", job, job->name.c_str());
+void ethosu_semaphore_give(void *sem) {
+    SemaphoreHandle_t handle = reinterpret_cast<SemaphoreHandle_t>(sem);
+    xSemaphoreGive(handle);
+}
+}
 
-        bool status = inferenceProcess.runJob(*job);
-        job->status = status;
+/****************************************************************************
+ * Functions
+ ****************************************************************************/
 
-        // Return inference job response
-        xQueueSend(job->queue, &job, portMAX_DELAY);
-    }
+// inferenceProcessTask - Run jobs from queue with available driver
+void inferenceProcessTask(void *pvParameters) {
+    ProcessTaskParams params = *reinterpret_cast<ProcessTaskParams *>(pvParameters);
+
+    class InferenceProcess inferenceProcess(params.tensorArena, params.arenaSize);
 
-    vTaskDelete(NULL);
+    for (;;) {
+        xInferenceJob *xJob;
+
+        xQueueReceive(params.queueHandle, &xJob, portMAX_DELAY);
+        bool status = inferenceProcess.runJob(*xJob);
+        xJob->status = status;
+        xQueueSend(xJob->responseQueue, &xJob, portMAX_DELAY);
+    }
+    vTaskDelete(nullptr);
 }
 
-void inferenceJobTask(void *pvParameters) {
+// inferenceSenderTask - Creates NUM_JOBS_PER_TASK jobs, queues them, and then listens for completion status
+void inferenceSenderTask(void *pvParameters) {
+    int ret = 0;
+
+    QueueHandle_t inferenceProcessQueue = reinterpret_cast<QueueHandle_t>(pvParameters);
+
+    xInferenceJob jobs[NUM_JOBS_PER_TASK];
+
     // Create queue for response messages
-    QueueHandle_t senderQueue = xQueueCreate(10, sizeof(xInferenceJob *));
-
-    // Inference job
-    DataPtr networkModel(networkModelData, sizeof(networkModelData));
-    DataPtr input(inputData, sizeof(inputData));
-    DataPtr expected(expectedData, sizeof(expectedData));
-
-    xInferenceJob job;
-    xInferenceJob *j = &job;
-    job.name = "mobilenet_v2";
-    job.networkModel = networkModel;
-    job.input.push_back(input);
-    job.expectedOutput.push_back(expected);
-    job.queue = senderQueue;
-
-    // Send job
-    printf("Sending inference job\n");
-    xQueueSend(inferenceProcessQueue, &j, portMAX_DELAY);
-
-    // Wait for response
-    xQueueReceive(senderQueue, &j, portMAX_DELAY);
-    printf("Received inference job response. status=%u\n", j->status);
-
-    exit(j->status);
+    QueueHandle_t senderQueue = xQueueCreate(NUM_JOBS_PER_TASK, sizeof(xInferenceJob *));
+
+    // Create and queue the jobs
+    for (int n = 0; n < NUM_JOBS_PER_TASK; n++) {
+        // Create job
+        xInferenceJob *job = &jobs[n];
+        job->name          = string(modelName);
+        job->networkModel  = DataPtr(networkModelData, sizeof(networkModelData));
+        job->input.push_back(DataPtr(inputData, sizeof(inputData)));
+        job->expectedOutput.push_back(DataPtr(expectedOutputData, sizeof(expectedOutputData)));
+        job->responseQueue = senderQueue;
+
+        // Send job
+        printf("Sending inference job: job=%p, name=%s\n", job, job->name.c_str());
+        xQueueSend(inferenceProcessQueue, &job, portMAX_DELAY);
+    }
+
+    // Listen for completion status
+    do {
+        xInferenceJob *pSendJob;
+        xQueueReceive(senderQueue, &pSendJob, portMAX_DELAY);
+        printf("inferenceSenderTask: received response for job: %s, status = %u\n",
+               pSendJob->name.c_str(),
+               pSendJob->status);
+
+        totalCompletedJobs++;
+        ret = (pSendJob->status);
+        if (pSendJob->status != 0) {
+            break;
+        }
+    } while (totalCompletedJobs < NUM_JOBS_PER_TASK * NUM_JOB_TASKS);
+
+    vQueueDelete(senderQueue);
+
+    printf("FreeRTOS application returning %d.\n", ret);
+    exit(ret);
 }
 
-} // namespace
+/****************************************************************************
+ * Application
+ ****************************************************************************/
 
-/* Keep the queue ouf of the stack sinde freertos resets it when the scheduler starts.*/
-QueueHandle_t inferenceProcessQueue;
+// Declare variables in global scope to keep them off the stack, since FreeRTOS resets the stack when the scheduler is started
+static QueueHandle_t inferenceProcessQueue;
+static ProcessTaskParams taskParams[NUM_INFERENCE_TASKS];
 
+// FreeRTOS application. NOTE: Additional tasks may require increased heap size.
 int main() {
-    // Inference process
-    inferenceProcessQueue = xQueueCreate(10, sizeof(xInferenceJob *));
-    xTaskCreate(inferenceProcessTask, "inferenceProcess", 2 * 1024, inferenceProcessQueue, 1, nullptr);
+    BaseType_t ret;
+    inferenceProcessQueue = xQueueCreate(NUM_JOBS_PER_TASK, sizeof(xInferenceJob *));
+
+    // Create inferenceSender tasks to create and queue the jobs
+    for (int n = 0; n < NUM_JOB_TASKS; n++) {
+        ret = xTaskCreate(inferenceSenderTask, "inferenceSenderTask", 2 * 1024, inferenceProcessQueue, 2, nullptr);
+        if (ret != pdPASS) {
+            printf("FreeRTOS: Failed to create 'inferenceSenderTask%i'\n", n);
+            exit(1);
+        }
+    }
 
-    // Inference job task
-    xTaskCreate(inferenceJobTask, "inferenceJob", 2 * 1024, inferenceProcessQueue, 2, nullptr);
+    // Create inferenceProcess tasks to process the queued jobs
+    for (int n = 0; n < NUM_INFERENCE_TASKS; n++) {
+        taskParams[n] = ProcessTaskParams(inferenceProcessQueue, inferenceProcessTensorArena[n], arenaSize);
+        ret = xTaskCreate(inferenceProcessTask, "inferenceProcessTask", 3 * 1024, &taskParams[n], 3, nullptr);
+        if (ret != pdPASS) {
+            printf("FreeRTOS: Failed to create 'inferenceProcessTask%i'\n", n);
+            exit(1);
+        }
+    }
 
-    // Run the scheduler
+    // Start Scheduler
     vTaskStartScheduler();
 
+    printf("FreeRTOS application failed to initialise\n");
+    exit(1);
+
     return 0;
-}
+}
\ No newline at end of file
diff --git a/applications/freertos/model.h b/applications/freertos/model.h
index cd8f656..3b3078a 100644
--- a/applications/freertos/model.h
+++ b/applications/freertos/model.h
@@ -16,6 +16,8 @@
  * limitations under the License.
  */
 
+const char *modelName = "keyword_spotting_cnn_small_int8";
+
 uint8_t networkModelData[] __attribute__((section("network_model_sec"), aligned(16))) = {
     0x28, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00,
diff --git a/applications/freertos/output.h b/applications/freertos/output.h
index 48380b4..0aa7954 100644
--- a/applications/freertos/output.h
+++ b/applications/freertos/output.h
@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-uint8_t expectedData[1001] __attribute__((aligned(4), section("expected_output_data_sec"))) = {
+uint8_t expectedOutputData[1001] __attribute__((aligned(4), section("expected_output_data_sec"))) = {
     0x32, 0x35, 0x45, 0x51, 0x55, 0x5b, 0x4c, 0x5a, 0x42, 0x39, 0x2f, 0x37,
     0x3b, 0x31, 0x3f, 0x3a, 0x36, 0x43, 0x36, 0x46, 0x32, 0x39, 0x32, 0x35,
     0x3b, 0x31, 0x48, 0x3d, 0x43, 0x3c, 0x3b, 0x43, 0x45, 0x30, 0x43, 0x52,
-- 
cgit v1.2.1