From de54e1606b21d333e126525807414455d2ff1840 Mon Sep 17 00:00:00 2001 From: Kshitij Sisodia Date: Tue, 20 Dec 2022 16:09:15 +0000 Subject: MLECO-3659: Improvement for NPU PMU counters The NPU idle count could have been erroneously high as the counters were always running. This change utilises callback functions to start/stop the counters only when the inferences start/stop executing on the NPU. Changes have been made to cache maintenance functions called from within the NPU driver's pipeline to reduce the overhead caused by these. Change-Id: I69db0d3b3f3fe5b2847e15b5c3096cb1e0484176 Signed-off-by: Kshitij Sisodia --- build_default.py | 1 + .../hal/source/components/npu/ethosu_cpu_cache.c | 94 +++++++++++++++++++--- source/hal/source/components/npu/ethosu_profiler.c | 90 +++++++++++++++------ .../components/npu/include/ethosu_cpu_cache.h | 5 ++ 4 files changed, 152 insertions(+), 38 deletions(-) diff --git a/build_default.py b/build_default.py index 387e9ba..e35aa1c 100755 --- a/build_default.py +++ b/build_default.py @@ -122,6 +122,7 @@ def run( + f" -DCMAKE_TOOLCHAIN_FILE={cmake_toolchain_file}" + f" -DETHOS_U_NPU_ID={ethos_u_cfg.ethos_u_npu_id}" + f" -DETHOS_U_NPU_CONFIG_ID={ethos_u_cfg.ethos_u_config_id}" + + f" -DTENSORFLOW_LITE_MICRO_CLEAN_DOWNLOADS=ON" ) logging.info(f"\n\n\n{cmake_command}\n\n\n") diff --git a/source/hal/source/components/npu/ethosu_cpu_cache.c b/source/hal/source/components/npu/ethosu_cpu_cache.c index 0840971..d5f5e47 100644 --- a/source/hal/source/components/npu/ethosu_cpu_cache.c +++ b/source/hal/source/components/npu/ethosu_cpu_cache.c @@ -21,34 +21,104 @@ #include "ethosu_driver.h" /* Arm Ethos-U driver header */ #include "log_macros.h" /* Logging macros */ +/** Structure to maintain data cache states. */ +typedef struct _cpu_cache_state { + uint32_t dcache_invalidated : 1; + uint32_t dcache_cleaned : 1; +} cpu_cache_state; + +/** Static CPU cache state object. 
+ * @note This logic around flipping these states is based on the driver + * calling the functions in this sequence: + * + * Cache flush (ethosu_flush_dcache) + * ↓ + * Start inference (ethosu_inference_begin) + * ↓ + * Inference (ethosu_dev_run_command_stream) + * ↓ + * End inference (ethosu_inference_end) + * ↓ + * Cache invalidate (ethosu_invalidate_dcache) + **/ +static cpu_cache_state s_cache_state = {.dcache_cleaned = 0, .dcache_invalidated = 0}; + +/** + * @brief Gets the current CPU cache state. + * @return Pointer to the CPU cache state object. + */ +static cpu_cache_state* ethosu_get_cpu_cache_state(void) +{ + return &s_cache_state; +} + +void ethosu_clear_cache_states(void) +{ + cpu_cache_state* const state = ethosu_get_cpu_cache_state(); + trace("Clearing cache state members\n"); + state->dcache_invalidated = 0; + state->dcache_cleaned = 0; +} + void ethosu_flush_dcache(uint32_t *p, size_t bytes) { + UNUSED(p); + UNUSED(bytes); #if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) + cpu_cache_state* const state = ethosu_get_cpu_cache_state(); if (SCB->CCR & SCB_CCR_DC_Msk) { - if (p) { - SCB_CleanDCache_by_Addr((void *) p, (int32_t) bytes); - } else { + + /** + * @note We could choose to call the `SCB_CleanDCache_by_Addr` function + * here, but the sizes which this function is called for, can + * cause unnecessary delays. It's worth noting that this function + * is called from the Arm Ethos-U NPU driver repeatedly for each + * region it accesses. This could even be RO memory which does + * not need cache maintenance, along with parts of the input and + * output tensors which rightly need to be cleaned. Therefore, to + * reduce overhead of repeated calls for large memory sizes, we + * call the clean and invalidation functions for whole cache. 
+ * + * If the neural network to be executed is completely falling + * onto the NPU, consider disabling the data cache altogether + * for the duration of the inference to further reduce the cache + * maintenance burden in these functions. + */ + + /** Clean the cache if it hasn't been cleaned already */ + if (!state->dcache_cleaned) { + trace("Cleaning data cache\n"); SCB_CleanDCache(); + + /** Assert the cache cleaned state and clear the invalidation + * state. */ + state->dcache_cleaned = 1; + state->dcache_invalidated = 0; } } -#else - UNUSED(p); - UNUSED(bytes); #endif /* defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) */ } void ethosu_invalidate_dcache(uint32_t *p, size_t bytes) { + UNUSED(p); + UNUSED(bytes); #if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) + cpu_cache_state* const state = ethosu_get_cpu_cache_state(); if (SCB->CCR & SCB_CCR_DC_Msk) { - if (p) { - SCB_InvalidateDCache_by_Addr((void *) p, (int32_t) bytes); - } else { + /** + * See note in ethosu_flush_dcache function for why we clean the whole + * cache instead of calling it for specific addresses. + **/ + if (!state->dcache_invalidated) { + trace("Invalidating data cache\n"); SCB_InvalidateDCache(); + + /** Assert the cache invalidation state and clear the clean + * state. 
*/ + state->dcache_invalidated = 1; + state->dcache_cleaned = 0; } } -#else - UNUSED(p); - UNUSED(bytes); #endif /* defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) */ } diff --git a/source/hal/source/components/npu/ethosu_profiler.c b/source/hal/source/components/npu/ethosu_profiler.c index b3f93da..dea704c 100644 --- a/source/hal/source/components/npu/ethosu_profiler.c +++ b/source/hal/source/components/npu/ethosu_profiler.c @@ -1,6 +1,6 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates - * SPDX-License-Identifier: Apache-2.0 + * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,16 @@ */ #include "ethosu_profiler.h" +#include "ethosu_cpu_cache.h" #include "log_macros.h" #include -extern struct ethosu_driver ethosu_drv; /* Default Arm Ethos-U NPU device driver object */ -static ethosu_pmu_counters npu_counters; /* NPU counter local instance */ -static const char* unit_beats = "beats"; +extern struct ethosu_driver ethosu_drv; /* Default Arm Ethos-U NPU device driver object */ +static ethosu_pmu_counters s_npu_counters; /* NPU counter local instance */ +static uint32_t s_evt_mask = 0; /* PMU event mask */ + +static const char* unit_beats = "beats"; static const char* unit_cycles = "cycles"; /** @@ -31,7 +34,26 @@ static const char* unit_cycles = "cycles"; */ static ethosu_pmu_counters* get_counter_instance(void) { - return &npu_counters; + return &s_npu_counters; +} + +/** + * @brief Gets the enabled event mask from the PMU driver. + * @return Event mask as an unsigned 32 bit integer. + */ +static uint32_t get_event_mask(void) +{ + return s_evt_mask; +} + +/** + * @brief Sets the enabled event mask for the PMU driver. + * @param[in] mask event mask as an unsigned 32 bit integer. + * @return none. 
+ */ +static void set_event_mask(uint32_t mask) +{ + s_evt_mask = mask; } /** @@ -49,8 +71,8 @@ static bool counter_overflow(uint32_t pmu_counter_mask) void ethosu_pmu_init(void) { - uint32_t i = 0; - uint32_t evt_mask = ETHOSU_PMU_CCNT_Msk; + uint32_t i = 0; + uint32_t evt_mask = ETHOSU_PMU_CCNT_Msk; ethosu_pmu_counters* counters = get_counter_instance(); memset(counters, 0, sizeof(*counters)); @@ -58,31 +80,31 @@ void ethosu_pmu_init(void) counters->num_total_counters = ETHOSU_PROFILER_NUM_COUNTERS; #if ETHOSU_PMU_NCOUNTERS >= 4 - counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_IDLE; + counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_ACTIVE; counters->npu_evt_counters[0].event_mask = ETHOSU_PMU_CNT1_Msk; - counters->npu_evt_counters[0].name = "NPU IDLE"; - counters->npu_evt_counters[0].unit = unit_cycles; + counters->npu_evt_counters[0].name = "NPU ACTIVE"; + counters->npu_evt_counters[0].unit = unit_cycles; counters->npu_evt_counters[1].event_type = ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED; counters->npu_evt_counters[1].event_mask = ETHOSU_PMU_CNT2_Msk; - counters->npu_evt_counters[1].name = "NPU AXI0_RD_DATA_BEAT_RECEIVED"; - counters->npu_evt_counters[1].unit = unit_beats; + counters->npu_evt_counters[1].name = "NPU AXI0_RD_DATA_BEAT_RECEIVED"; + counters->npu_evt_counters[1].unit = unit_beats; counters->npu_evt_counters[2].event_type = ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN; counters->npu_evt_counters[2].event_mask = ETHOSU_PMU_CNT3_Msk; - counters->npu_evt_counters[2].name = "NPU AXI0_WR_DATA_BEAT_WRITTEN"; - counters->npu_evt_counters[2].unit = unit_beats; + counters->npu_evt_counters[2].name = "NPU AXI0_WR_DATA_BEAT_WRITTEN"; + counters->npu_evt_counters[2].unit = unit_beats; counters->npu_evt_counters[3].event_type = ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED; counters->npu_evt_counters[3].event_mask = ETHOSU_PMU_CNT4_Msk; - counters->npu_evt_counters[3].name = "NPU AXI1_RD_DATA_BEAT_RECEIVED"; - counters->npu_evt_counters[3].unit = unit_beats; + 
counters->npu_evt_counters[3].name = "NPU AXI1_RD_DATA_BEAT_RECEIVED"; + counters->npu_evt_counters[3].unit = unit_beats; #else /* ETHOSU_PMU_NCOUNTERS >= 4 */ - #error "NPU PMU expects a minimum of 4 available event triggered counters!" +#error "NPU PMU expects a minimum of 4 available event triggered counters!" #endif /* ETHOSU_PMU_NCOUNTERS >= 4 */ #if ETHOSU_DERIVED_NCOUNTERS >= 1 - counters->npu_derived_counters[0].name = "NPU ACTIVE"; + counters->npu_derived_counters[0].name = "NPU IDLE"; counters->npu_derived_counters[0].unit = unit_cycles; #endif /* ETHOSU_DERIVED_NCOUNTERS >= 1 */ @@ -91,16 +113,17 @@ void ethosu_pmu_init(void) evt_mask |= counters->npu_evt_counters[i].event_mask; } + set_event_mask(evt_mask); + /* Reset overflow status. */ - ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, evt_mask); + ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, get_event_mask()); /* Enable PMU. */ ETHOSU_PMU_Enable(&ethosu_drv); /* Enable counters for cycle and event counters. */ - ETHOSU_PMU_CNTR_Disable(&ethosu_drv, evt_mask); + ETHOSU_PMU_CNTR_Disable(&ethosu_drv, get_event_mask()); ethosu_pmu_reset_counters(); - ETHOSU_PMU_CNTR_Enable(&ethosu_drv, evt_mask); } /** @@ -120,15 +143,14 @@ void ethosu_pmu_reset_counters(void) ethosu_pmu_counters ethosu_get_pmu_counters(void) { ethosu_pmu_counters* counters = get_counter_instance(); - uint32_t i = 0; + uint32_t i = 0; /* Event counters */ for (i = 0; i < ETHOSU_PMU_NCOUNTERS; ++i) { if (counter_overflow(counters->npu_evt_counters[i].event_mask)) { warn("Counter overflow detected for %s.\n", counters->npu_evt_counters[i].name); } - counters->npu_evt_counters[i].counter_value = - ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i); + counters->npu_evt_counters[i].counter_value = ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i); } /* Total cycle count */ @@ -136,7 +158,9 @@ ethosu_pmu_counters ethosu_get_pmu_counters(void) /* Derived counters */ #if ETHOSU_DERIVED_NCOUNTERS >= 1 - if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_IDLE) { + if 
(counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_ACTIVE) { + + /* Compute the idle count */ counters->npu_derived_counters[0].counter_value = counters->npu_total_ccnt - counters->npu_evt_counters[0].counter_value; } @@ -144,3 +168,17 @@ ethosu_pmu_counters ethosu_get_pmu_counters(void) return *counters; } + +void ethosu_inference_begin(struct ethosu_driver* drv, void* userArg) +{ + UNUSED(userArg); + ethosu_clear_cache_states(); + ETHOSU_PMU_CNTR_Disable(drv, get_event_mask()); + ETHOSU_PMU_CNTR_Enable(drv, get_event_mask()); +} + +void ethosu_inference_end(struct ethosu_driver* drv, void* userArg) +{ + UNUSED(userArg); + ETHOSU_PMU_CNTR_Disable(drv, get_event_mask()); +} diff --git a/source/hal/source/components/npu/include/ethosu_cpu_cache.h b/source/hal/source/components/npu/include/ethosu_cpu_cache.h index faf26c2..d5de3d5 100644 --- a/source/hal/source/components/npu/include/ethosu_cpu_cache.h +++ b/source/hal/source/components/npu/include/ethosu_cpu_cache.h @@ -20,6 +20,11 @@ #include #include +/** + * @brief Clears all the cache state members. + */ +void ethosu_clear_cache_states(void); + /** * @brief Flush/clean the data cache by address and size. Passing NULL as p argument * expects the whole cache to be flushed. -- cgit v1.2.1