From 1a3bb928b3820b089198afb99bbf410b1f210700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonny=20Sv=C3=A4rd?= Date: Fri, 25 Feb 2022 16:28:21 +0100 Subject: Add driver async API Add an asynchronous API to the driver. The current synchronous API is now using the new async API internally. The main new functions are ethosu_invoke_async() and ethosu_wait(). Every successful call to ethosu_invoke_async() must be followed by a call to ethosu_wait() to get the status of the inference. The wait function can be called in a blocking or non-blocking mode by specifying the `block` argument to true/false. The regular synchronous invoke function is implemented as an invoke_async followed by a wait(block=true) call. Short summary of changes: - Add an internal ethosu_job struct to keep track of inference data and job state. - Use async API in blocking mode for normal flow - Change default semaphore implementation to binary type - Move error prints out of interrupt context - Move ethosu_inference_begin() callback to right before HW invoke - Always call ethosu_inference_end() callback, even in case of errors - On NPU error, do not keep NPU powered after being reset Change-Id: If4c3c46e3c6732a669e17251bd848dea5765a490 --- include/ethosu_driver.h | 47 ++++++- src/ethosu_device.h | 18 ++- src/ethosu_device_u55_u65.c | 24 ++-- src/ethosu_driver.c | 313 ++++++++++++++++++++++++++++---------------- 4 files changed, 271 insertions(+), 131 deletions(-) diff --git a/include/ethosu_driver.h b/include/ethosu_driver.h index ff8d1eb..bf6a578 100644 --- a/include/ethosu_driver.h +++ b/include/ethosu_driver.h @@ -48,17 +48,35 @@ extern "C" { // Forward declare struct ethosu_device; +enum ethosu_job_state +{ + ETHOSU_JOB_IDLE = 0, + ETHOSU_JOB_RUNNING, + ETHOSU_JOB_DONE +}; + +struct ethosu_job +{ + volatile enum ethosu_job_state state; + const void *custom_data_ptr; + int custom_data_size; + const uint64_t *base_addr; + const size_t *base_addr_size; + int num_base_addr; + void *user_arg; +}; + 
struct ethosu_driver { struct ethosu_device *dev; struct ethosu_driver *next; + struct ethosu_job job; void *semaphore; uint64_t fast_memory; size_t fast_memory_size; bool status_error; bool dev_power_always_on; bool reserved; - volatile bool irq_triggered; uint8_t clock_request; uint8_t power_request; }; @@ -158,6 +176,33 @@ int ethosu_invoke_v3(struct ethosu_driver *drv, #define ethosu_invoke(drv, custom_data_ptr, custom_data_size, base_addr, base_addr_size, num_base_addr) \ ethosu_invoke_v3(drv, custom_data_ptr, custom_data_size, base_addr, base_addr_size, num_base_addr, 0) +/** + * Invoke Vela command stream using async interface. + * Must be followed by call(s) to ethosu_wait() upon successful return. + * Returns + * -1 on error + * 0 on success + */ +int ethosu_invoke_async(struct ethosu_driver *drv, + const void *custom_data_ptr, + const int custom_data_size, + const uint64_t *base_addr, + const size_t *base_addr_size, + const int num_base_addr, + void *user_arg); + +/** + * Wait for inference to complete (block=true) + * Poll status or finish up if inference is complete (block=false) + * (This function is only intended to be used in conjuction with ethosu_invoke_async) + * Returns + * 1 on inference running (only for block=false) + * 0 on inference success + * -1 on inference error + * -2 on inference not invoked + */ +int ethosu_wait(struct ethosu_driver *drv, bool block); + /** * Set Ethos-U power mode. */ diff --git a/src/ethosu_device.h b/src/ethosu_device.h index cc9ea8b..0b45fd6 100644 --- a/src/ethosu_device.h +++ b/src/ethosu_device.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. All rights reserved. + * Copyright (c) 2019-2022 Arm Limited. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 * @@ -81,13 +81,17 @@ enum ethosu_error_codes ethosu_dev_axi_init(struct ethosu_device *dev); * - All input tensors * - All output tensors * \param[in] num_base_addr Number of base addresses. 
- * \return \ref ethosu_error_codes */ -enum ethosu_error_codes ethosu_dev_run_command_stream(struct ethosu_device *dev, - const uint8_t *cmd_stream_ptr, - uint32_t cms_length, - const uint64_t *base_addr, - int num_base_addr); +void ethosu_dev_run_command_stream(struct ethosu_device *dev, + const uint8_t *cmd_stream_ptr, + uint32_t cms_length, + const uint64_t *base_addr, + int num_base_addr); + +/** + * Print information on NPU error status + */ +void ethosu_dev_print_err_status(struct ethosu_device *dev); /** * Interrupt handler on device layer diff --git a/src/ethosu_device_u55_u65.c b/src/ethosu_device_u55_u65.c index 9a92f63..31379fc 100644 --- a/src/ethosu_device_u55_u65.c +++ b/src/ethosu_device_u55_u65.c @@ -133,11 +133,11 @@ enum ethosu_error_codes ethosu_dev_axi_init(struct ethosu_device *dev) return ETHOSU_SUCCESS; } -enum ethosu_error_codes ethosu_dev_run_command_stream(struct ethosu_device *dev, - const uint8_t *cmd_stream_ptr, - uint32_t cms_length, - const uint64_t *base_addr, - int num_base_addr) +void ethosu_dev_run_command_stream(struct ethosu_device *dev, + const uint8_t *cmd_stream_ptr, + uint32_t cms_length, + const uint64_t *base_addr, + int num_base_addr) { assert(num_base_addr <= NPU_REG_BASEP_ARRLEN); @@ -168,8 +168,14 @@ enum ethosu_error_codes ethosu_dev_run_command_stream(struct ethosu_device *dev, dev->reg->CMD.word = cmd.word; LOG_DEBUG("CMD=0x%08x", cmd.word); +} - return ETHOSU_SUCCESS; +void ethosu_dev_print_err_status(struct ethosu_device *dev) +{ + LOG_ERR("NPU status=0x%08" PRIx32 ", qread=%" PRIu32 ", cmd_end_reached=%d", + dev->reg->STATUS.word, + dev->reg->QREAD.word, + dev->reg->STATUS.cmd_end_reached); } bool ethosu_dev_handle_interrupt(struct ethosu_device *dev) @@ -185,12 +191,6 @@ bool ethosu_dev_handle_interrupt(struct ethosu_device *dev) if (dev->reg->STATUS.bus_status || dev->reg->STATUS.cmd_parse_error || dev->reg->STATUS.wd_fault || dev->reg->STATUS.ecc_fault || !dev->reg->STATUS.cmd_end_reached) { - LOG_ERR("NPU 
fault. status=0x%08" PRIx32 ", qread=%" PRIu32 ", cmd_end_reached=%d", - dev->reg->STATUS.word, - dev->reg->QREAD.word, - dev->reg->STATUS.cmd_end_reached); - ethosu_dev_soft_reset(dev); - ethosu_dev_set_clock_and_power(dev, ETHOSU_CLOCK_Q_UNCHANGED, ETHOSU_POWER_Q_DISABLE); return false; } diff --git a/src/ethosu_driver.c b/src/ethosu_driver.c index 866b94c..9175991 100644 --- a/src/ethosu_driver.c +++ b/src/ethosu_driver.c @@ -147,7 +147,7 @@ void __attribute__((weak)) ethosu_invalidate_dcache(uint32_t *p, size_t bytes) struct ethosu_semaphore_t { - int count; + uint8_t count; }; static void *ethosu_mutex; @@ -177,7 +177,7 @@ void __attribute__((weak)) ethosu_mutex_unlock(void *mutex) void *__attribute__((weak)) ethosu_semaphore_create(void) { struct ethosu_semaphore_t *sem = malloc(sizeof(*sem)); - sem->count = 1; + sem->count = 0; return sem; } @@ -190,18 +190,18 @@ void __attribute__((weak)) ethosu_semaphore_destroy(void *sem) void __attribute__((weak)) ethosu_semaphore_take(void *sem) { struct ethosu_semaphore_t *s = sem; - while (s->count <= 0) + while (s->count == 0) { __WFE(); } - s->count--; + s->count = 0; } // Baremetal simulation of giving a semaphore and waking up processes using intrinsics void __attribute__((weak)) ethosu_semaphore_give(void *sem) { struct ethosu_semaphore_t *s = sem; - s->count++; + s->count = 1; __SEV(); } @@ -224,20 +224,6 @@ void __attribute__((weak)) ethosu_inference_end(struct ethosu_driver *drv, void /****************************************************************************** * Static functions ******************************************************************************/ -static inline void wait_for_irq(struct ethosu_driver *drv) -{ - while (1) - { - if (drv->irq_triggered) - { - drv->irq_triggered = false; - break; - } - - ethosu_semaphore_take(drv->semaphore); - } -} - static void ethosu_register_driver(struct ethosu_driver *drv) { // Register driver as new HEAD of list @@ -290,6 +276,11 @@ static struct ethosu_driver 
*ethosu_find_and_reserve_driver(void) return NULL; } +static void ethosu_reset_job(struct ethosu_driver *drv) +{ + memset(&drv->job, 0, sizeof(struct ethosu_job)); +} + static int handle_optimizer_config(struct ethosu_driver *drv, struct opt_cfg_s *opt_cfg_p) { LOG_INFO("Optimizer release nbr: %d patch: %d", opt_cfg_p->da_data.rel_nbr, opt_cfg_p->da_data.patch_nbr); @@ -302,12 +293,7 @@ static int handle_optimizer_config(struct ethosu_driver *drv, struct opt_cfg_s * return 0; } -static int handle_command_stream(struct ethosu_driver *drv, - const uint8_t *cmd_stream, - const int cms_length, - const uint64_t *base_addr, - const size_t *base_addr_size, - const int num_base_addr) +static int handle_command_stream(struct ethosu_driver *drv, const uint8_t *cmd_stream, const int cms_length) { uint32_t cms_bytes = cms_length * BYTES_IN_32_BITS; ptrdiff_t cmd_stream_ptr = (ptrdiff_t)cmd_stream; @@ -321,27 +307,28 @@ static int handle_command_stream(struct ethosu_driver *drv, } // Verify 16 byte alignment for base address' - for (int i = 0; i < num_base_addr; i++) + for (int i = 0; i < drv->job.num_base_addr; i++) { - if (0 != (base_addr[i] & MASK_16_BYTE_ALIGN)) + if (0 != (drv->job.base_addr[i] & MASK_16_BYTE_ALIGN)) { - LOG_ERR("Base addr %d: 0x%llx not aligned to 16 bytes", i, base_addr[i]); + LOG_ERR("Base addr %d: 0x%llx not aligned to 16 bytes", i, drv->job.base_addr[i]); return -1; } } - /* Flush the cache if available on CPU. - * The upcasting to uin32_t* is ok since the pointer never is dereferenced. - * The base_addr_size is null if invoking from prior to invoke_V2, in that case - * the whole cache is being flushed. - */ + drv->job.state = ETHOSU_JOB_RUNNING; + + // Flush the cache if available on CPU. + // The upcasting to uin32_t* is ok since the pointer never is dereferenced. + // The base_addr_size is null if invoking from prior to invoke_V2, in that case + // the whole cache is being flushed. 
- if (base_addr_size != NULL) + if (drv->job.base_addr_size != NULL) { ethosu_flush_dcache((uint32_t *)cmd_stream_ptr, cms_bytes); - for (int i = 0; i < num_base_addr; i++) + for (int i = 0; i < drv->job.num_base_addr; i++) { - ethosu_flush_dcache((uint32_t *)(uintptr_t)base_addr[i], base_addr_size[i]); + ethosu_flush_dcache((uint32_t *)(uintptr_t)drv->job.base_addr[i], drv->job.base_addr_size[i]); } } else @@ -349,31 +336,23 @@ static int handle_command_stream(struct ethosu_driver *drv, ethosu_flush_dcache(NULL, 0); } - // Execute the command stream - if (ETHOSU_SUCCESS != ethosu_dev_run_command_stream(drv->dev, cmd_stream, cms_bytes, base_addr, num_base_addr)) + // Request power gating disabled during inference run + if (!drv->dev_power_always_on) { - return -1; - } - - wait_for_irq(drv); + // Will soft reset if security state or privilege level needs changing. + // Also note that any configurations done in the NPU prior to this point + // are lost in case power gating has been in effect. 
+ set_clock_and_power_request(drv, ETHOSU_INFERENCE_REQUEST, ETHOSU_CLOCK_Q_ENABLE, ETHOSU_POWER_Q_DISABLE); - // Check if any error occured - if (drv->status_error) - { - return -1; + // Make sure AXI settings are applied + ethosu_dev_axi_init(drv->dev); } - if (base_addr_size != NULL) - { - for (int i = 0; i < num_base_addr; i++) - { - ethosu_invalidate_dcache((uint32_t *)(uintptr_t)base_addr[i], base_addr_size[i]); - } - } - else - { - ethosu_invalidate_dcache(NULL, 0); - } + // Inference begin callback + ethosu_inference_begin(drv, drv->job.user_arg); + + // Execute the command stream + ethosu_dev_run_command_stream(drv->dev, cmd_stream, cms_bytes, drv->job.base_addr, drv->job.num_base_addr); return 0; } @@ -385,7 +364,7 @@ void __attribute__((weak)) ethosu_irq_handler(struct ethosu_driver *drv) { LOG_DEBUG("Got interrupt from Ethos-U"); - drv->irq_triggered = true; + drv->job.state = ETHOSU_JOB_DONE; if (!ethosu_dev_handle_interrupt(drv->dev)) { drv->status_error = true; @@ -424,7 +403,6 @@ int ethosu_init(struct ethosu_driver *drv, drv->fast_memory = (uint32_t)fast_memory; drv->fast_memory_size = fast_memory_size; - drv->irq_triggered = false; // Initialize the device and set requested security state and privilege mode drv->dev = ethosu_dev_init(base_address, secure_enable, privilege_enable); @@ -476,33 +454,129 @@ void ethosu_get_hw_info(struct ethosu_driver *drv, struct ethosu_hw_info *hw) ethosu_dev_get_hw_info(drv->dev, hw); } -int ethosu_invoke_v3(struct ethosu_driver *drv, - const void *custom_data_ptr, - const int custom_data_size, - const uint64_t *base_addr, - const size_t *base_addr_size, - const int num_base_addr, - void *user_arg) +int ethosu_wait(struct ethosu_driver *drv, bool block) +{ + int ret = 0; + + switch (drv->job.state) + { + case ETHOSU_JOB_IDLE: + LOG_ERR("Inference job not running..."); + ret = -2; + break; + case ETHOSU_JOB_RUNNING: + if (!block) + { + // Inference still running, do not block + ret = 1; + break; + } + // fall 
through + case ETHOSU_JOB_DONE: + // Wait for interrupt in blocking mode. In non-blocking mode + // the interrupt has already triggered + ethosu_semaphore_take(drv->semaphore); + + // Inference done callback + ethosu_inference_end(drv, drv->job.user_arg); + + // Check NPU and interrupt status + if (drv->status_error) + { + LOG_ERR("NPU error(s) occured during inference."); + ethosu_dev_print_err_status(drv->dev); + + // Reset the NPU + (void)ethosu_dev_soft_reset(drv->dev); + // NPU is no longer in error state + drv->status_error = false; + + ret = -1; + } + + // Clear the clock/power gating disable request + if (!drv->dev_power_always_on) + { + // NOTE: Other requesters (like PMU) can be active, keeping + // clock/power gating disabled until no requests remain. + set_clock_and_power_request(drv, ETHOSU_INFERENCE_REQUEST, ETHOSU_CLOCK_Q_ENABLE, ETHOSU_POWER_Q_ENABLE); + } + + if (ret == 0) + { + // Invalidate cache + if (drv->job.base_addr_size != NULL) + { + for (int i = 0; i < drv->job.num_base_addr; i++) + { + ethosu_invalidate_dcache((uint32_t *)(uintptr_t)drv->job.base_addr[i], drv->job.base_addr_size[i]); + } + } + else + { + ethosu_invalidate_dcache(NULL, 0); + } + + LOG_DEBUG("Inference finished successfully..."); + } + + // Reset internal job (state resets to IDLE) + ethosu_reset_job(drv); + break; + + default: + LOG_ERR("Unexpected job state"); + ethosu_reset_job(drv); + ret = -1; + break; + } + + // Return inference job status + return ret; +} + +int ethosu_invoke_async(struct ethosu_driver *drv, + const void *custom_data_ptr, + const int custom_data_size, + const uint64_t *base_addr, + const size_t *base_addr_size, + const int num_base_addr, + void *user_arg) { + const struct cop_data_s *data_ptr = custom_data_ptr; const struct cop_data_s *data_end = (struct cop_data_s *)((ptrdiff_t)custom_data_ptr + custom_data_size); - int return_code = 0; + + // Make sure an inference is not already running + if (drv->job.state != ETHOSU_JOB_IDLE) + { + 
LOG_ERR("Inference already running, or waiting to be cleared..."); + return -1; + } + + drv->job.state = ETHOSU_JOB_IDLE; + drv->job.custom_data_ptr = custom_data_ptr; + drv->job.custom_data_size = custom_data_size; + drv->job.base_addr = base_addr; + drv->job.base_addr_size = base_addr_size; + drv->job.num_base_addr = num_base_addr; + drv->job.user_arg = user_arg; // First word in custom_data_ptr should contain "Custom Operator Payload 1" if (data_ptr->word != ETHOSU_FOURCC) { LOG_ERR("Custom Operator Payload: %" PRIu32 " is not correct, expected %x", data_ptr->word, ETHOSU_FOURCC); - return -1; + goto err; } // Custom data length must be a multiple of 32 bits if ((custom_data_size % BYTES_IN_32_BITS) != 0) { LOG_ERR("custom_data_size=0x%x not a multiple of 4", custom_data_size); - return -1; + goto err; } - ++data_ptr; + data_ptr++; // Adjust base address to fast memory area if (drv->fast_memory != 0 && num_base_addr >= FAST_MEMORY_BASE_ADDR_INDEX) @@ -514,53 +588,39 @@ int ethosu_invoke_v3(struct ethosu_driver *drv, LOG_ERR("Fast memory area too small. fast_memory_size=%u, base_addr_size=%u", drv->fast_memory_size, base_addr_size[FAST_MEMORY_BASE_ADDR_INDEX]); - return -1; + goto err; } *fast_memory = drv->fast_memory; } - // NPU might have lost power and thus its settings and state - if (!drv->dev_power_always_on) - { - // Set power ON during the inference. 
Will soft reset if security state or - // privilege level needs changing - set_clock_and_power_request(drv, ETHOSU_INFERENCE_REQUEST, ETHOSU_CLOCK_Q_ENABLE, ETHOSU_POWER_Q_DISABLE); - - // Make sure AXI settings are applied - ethosu_dev_axi_init(drv->dev); - } - drv->status_error = false; - ethosu_inference_begin(drv, user_arg); - + // Parse Custom Operator Payload data while (data_ptr < data_end) { - int ret = 0; switch (data_ptr->driver_action_command) { case OPTIMIZER_CONFIG: LOG_DEBUG("OPTIMIZER_CONFIG"); struct opt_cfg_s *opt_cfg_p = (struct opt_cfg_s *)data_ptr; - ret = handle_optimizer_config(drv, opt_cfg_p); + if (handle_optimizer_config(drv, opt_cfg_p) < 0) + { + goto err; + } data_ptr += DRIVER_ACTION_LENGTH_32_BIT_WORD + OPTIMIZER_CONFIG_LENGTH_32_BIT_WORD; break; case COMMAND_STREAM: + // Vela only supports putting one COMMAND_STREAM per op LOG_DEBUG("COMMAND_STREAM"); void *command_stream = (uint8_t *)(data_ptr) + sizeof(struct cop_data_s); int cms_length = (data_ptr->reserved << 16) | data_ptr->length; - // It is safe to clear this flag without atomic, because npu is not running. 
- drv->irq_triggered = false; - - ret = handle_command_stream(drv, command_stream, cms_length, base_addr, base_addr_size, num_base_addr); - if (ret < 0) + if (handle_command_stream(drv, command_stream, cms_length) < 0) { - LOG_ERR("Inference failed."); + goto err; } - data_ptr += DRIVER_ACTION_LENGTH_32_BIT_WORD + cms_length; break; case NOP: @@ -569,42 +629,58 @@ int ethosu_invoke_v3(struct ethosu_driver *drv, break; default: LOG_ERR("UNSUPPORTED driver_action_command: %d", data_ptr->driver_action_command); - ret = -1; - break; - } - if (ret != 0) - { - return_code = -1; + goto err; break; } } - ethosu_inference_end(drv, user_arg); + return 0; +err: + LOG_ERR("Failed to invoke inference."); + ethosu_reset_job(drv); + return -1; +} - if (!drv->status_error && !drv->dev_power_always_on) +int ethosu_invoke_v3(struct ethosu_driver *drv, + const void *custom_data_ptr, + const int custom_data_size, + const uint64_t *base_addr, + const size_t *base_addr_size, + const int num_base_addr, + void *user_arg) +{ + if (ethosu_invoke_async( + drv, custom_data_ptr, custom_data_size, base_addr, base_addr_size, num_base_addr, user_arg) < 0) { - set_clock_and_power_request(drv, ETHOSU_INFERENCE_REQUEST, ETHOSU_CLOCK_Q_ENABLE, ETHOSU_POWER_Q_ENABLE); + return -1; } - return return_code; + return ethosu_wait(drv, true); } void ethosu_set_power_mode(struct ethosu_driver *drv, bool always_on) { drv->dev_power_always_on = always_on; - if (always_on && ethosu_dev_verify_access_state(drv->dev) == false) + if (always_on) { - // Reset to enter correct security state/privilege mode - if (ethosu_dev_soft_reset(drv->dev) == false) + if (ethosu_dev_verify_access_state(drv->dev) == false) { - LOG_ERR("Failed to set power mode for Ethos-U"); - return; + // Reset to enter correct security state/privilege mode + if (ethosu_dev_soft_reset(drv->dev) == false) + { + LOG_ERR("Failed to set power mode for Ethos-U"); + return; + } } - } - ethosu_dev_set_clock_and_power( - drv->dev, 
ETHOSU_CLOCK_Q_UNCHANGED, always_on ? ETHOSU_POWER_Q_DISABLE : ETHOSU_POWER_Q_ENABLE); + ethosu_dev_set_clock_and_power(drv->dev, ETHOSU_CLOCK_Q_UNCHANGED, ETHOSU_POWER_Q_DISABLE); + ethosu_dev_axi_init(drv->dev); + } + else + { + ethosu_dev_set_clock_and_power(drv->dev, ETHOSU_CLOCK_Q_UNCHANGED, ETHOSU_POWER_Q_ENABLE); + } } struct ethosu_driver *ethosu_reserve_driver(void) @@ -635,6 +711,21 @@ void ethosu_release_driver(struct ethosu_driver *drv) ethosu_mutex_lock(ethosu_mutex); if (drv != NULL && drv->reserved) { + if (drv->job.state == ETHOSU_JOB_RUNNING || drv->job.state == ETHOSU_JOB_DONE) + { + // Give the inference one shot to complete or force kill the job + if (ethosu_wait(drv, false) == 1) + { + // Still running, soft reset the NPU and reset driver + ethosu_dev_soft_reset(drv->dev); + ethosu_reset_job(drv); + drv->status_error = false; + ethosu_semaphore_give(drv->semaphore); + (void)set_clock_and_power_request( + drv, ETHOSU_INFERENCE_REQUEST, ETHOSU_CLOCK_Q_ENABLE, ETHOSU_POWER_Q_ENABLE); + } + } + drv->reserved = false; LOG_DEBUG("NPU driver handle %p released", drv); ethosu_semaphore_give(ethosu_semaphore); -- cgit v1.2.1