summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kshitij Sisodia <kshitij.sisodia@arm.com> 2022-12-20 16:09:15 +0000
committer Kshitij Sisodia <kshitij.sisodia@arm.com> 2022-12-21 11:51:37 +0000
commit de54e1606b21d333e126525807414455d2ff1840 (patch)
tree 902d468f60b098cd91b9590db155d8fb08c06353
parent 2ea46232a15aaf7600f1b92314612f4aa2fc6cd2 (diff)
download ml-embedded-evaluation-kit-de54e1606b21d333e126525807414455d2ff1840.tar.gz
MLECO-3659: Improvement for NPU PMU counters
The NPU idle count could have been erroneously high as the counters were always running. This change utilises callback functions to start/stop the counters only when the inferences start/stop executing on the NPU. Changes have been made to cache maintenance functions called from within the NPU driver's pipeline to reduce the overhead caused by these. Change-Id: I69db0d3b3f3fe5b2847e15b5c3096cb1e0484176 Signed-off-by: Kshitij Sisodia <kshitij.sisodia@arm.com>
-rwxr-xr-x build_default.py 1
-rw-r--r-- source/hal/source/components/npu/ethosu_cpu_cache.c 94
-rw-r--r-- source/hal/source/components/npu/ethosu_profiler.c 90
-rw-r--r-- source/hal/source/components/npu/include/ethosu_cpu_cache.h 5
4 files changed, 152 insertions, 38 deletions
diff --git a/build_default.py b/build_default.py
index 387e9ba..e35aa1c 100755
--- a/build_default.py
+++ b/build_default.py
@@ -122,6 +122,7 @@ def run(
+ f" -DCMAKE_TOOLCHAIN_FILE={cmake_toolchain_file}"
+ f" -DETHOS_U_NPU_ID={ethos_u_cfg.ethos_u_npu_id}"
+ f" -DETHOS_U_NPU_CONFIG_ID={ethos_u_cfg.ethos_u_config_id}"
+ + f" -DTENSORFLOW_LITE_MICRO_CLEAN_DOWNLOADS=ON"
)
logging.info(f"\n\n\n{cmake_command}\n\n\n")
diff --git a/source/hal/source/components/npu/ethosu_cpu_cache.c b/source/hal/source/components/npu/ethosu_cpu_cache.c
index 0840971..d5f5e47 100644
--- a/source/hal/source/components/npu/ethosu_cpu_cache.c
+++ b/source/hal/source/components/npu/ethosu_cpu_cache.c
@@ -21,34 +21,104 @@
#include "ethosu_driver.h" /* Arm Ethos-U driver header */
#include "log_macros.h" /* Logging macros */
+/** Structure to maintain data cache states. */
+typedef struct _cpu_cache_state {
+ uint32_t dcache_invalidated : 1;
+ uint32_t dcache_cleaned : 1;
+} cpu_cache_state;
+
+/** Static CPU cache state object.
+ * @note This logic around flipping these states is based on the driver
+ * calling the functions in this sequence:
+ *
+ * Cache flush (ethosu_flush_dcache)
+ * ↓
+ * Start inference (ethosu_inference_begin)
+ * ↓
+ * Inference (ethosu_dev_run_command_stream)
+ * ↓
+ * End inference (ethosu_inference_end)
+ * ↓
+ * Cache invalidate (ethosu_invalidate_dcache)
+ **/
+static cpu_cache_state s_cache_state = {.dcache_cleaned = 0, .dcache_invalidated = 0};
+
+/**
+ * @brief Gets the current CPU cache state.
+ * @return Pointer to the CPU cache state object.
+ */
+static cpu_cache_state* ethosu_get_cpu_cache_state(void)
+{
+ return &s_cache_state;
+}
+
+void ethosu_clear_cache_states(void)
+{
+ cpu_cache_state* const state = ethosu_get_cpu_cache_state();
+ trace("Clearing cache state members\n");
+ state->dcache_invalidated = 0;
+ state->dcache_cleaned = 0;
+}
+
void ethosu_flush_dcache(uint32_t *p, size_t bytes)
{
+ UNUSED(p);
+ UNUSED(bytes);
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
+ cpu_cache_state* const state = ethosu_get_cpu_cache_state();
if (SCB->CCR & SCB_CCR_DC_Msk) {
- if (p) {
- SCB_CleanDCache_by_Addr((void *) p, (int32_t) bytes);
- } else {
+
+ /**
+ * @note We could choose to call the `SCB_CleanDCache_by_Addr` function
+ * here, but the sizes which this function is called for, can
+ * cause unnecessary delays. It's worth noting that this function
+ * is called from the Arm Ethos-U NPU driver repeatedly for each
+ * region it accesses. This could even be RO memory which does
+ * not need cache maintenance, along with parts of the input and
+ * output tensors which rightly need to be cleaned. Therefore, to
+ * reduce overhead of repeated calls for large memory sizes, we
+ * call the clean and invalidation functions for whole cache.
+ *
+ * If the neural network to be executed is completely falling
+ * onto the NPU, consider disabling the data cache altogether
+ * for the duration of the inference to further reduce the cache
+ * maintenance burden in these functions.
+ */
+
+ /** Clean the cache if it hasn't been cleaned already */
+ if (!state->dcache_cleaned) {
+ trace("Cleaning data cache\n");
SCB_CleanDCache();
+
+ /** Assert the cache cleaned state and clear the invalidation
+ * state. */
+ state->dcache_cleaned = 1;
+ state->dcache_invalidated = 0;
}
}
-#else
- UNUSED(p);
- UNUSED(bytes);
#endif /* defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) */
}
void ethosu_invalidate_dcache(uint32_t *p, size_t bytes)
{
+ UNUSED(p);
+ UNUSED(bytes);
#if defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
+ cpu_cache_state* const state = ethosu_get_cpu_cache_state();
if (SCB->CCR & SCB_CCR_DC_Msk) {
- if (p) {
- SCB_InvalidateDCache_by_Addr((void *) p, (int32_t) bytes);
- } else {
+ /**
+ * See note in ethosu_flush_dcache function for why we invalidate the
+ * whole cache instead of invalidating specific address ranges.
+ **/
+ if (!state->dcache_invalidated) {
+ trace("Invalidating data cache\n");
SCB_InvalidateDCache();
+
+ /** Assert the cache invalidation state and clear the clean
+ * state. */
+ state->dcache_invalidated = 1;
+ state->dcache_cleaned = 0;
}
}
-#else
- UNUSED(p);
- UNUSED(bytes);
#endif /* defined (__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) */
}
diff --git a/source/hal/source/components/npu/ethosu_profiler.c b/source/hal/source/components/npu/ethosu_profiler.c
index b3f93da..dea704c 100644
--- a/source/hal/source/components/npu/ethosu_profiler.c
+++ b/source/hal/source/components/npu/ethosu_profiler.c
@@ -1,6 +1,6 @@
/*
- * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
- * SPDX-License-Identifier: Apache-2.0
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates
+ * <open-source-office@arm.com> SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,13 +16,16 @@
*/
#include "ethosu_profiler.h"
+#include "ethosu_cpu_cache.h"
#include "log_macros.h"
#include <string.h>
-extern struct ethosu_driver ethosu_drv; /* Default Arm Ethos-U NPU device driver object */
-static ethosu_pmu_counters npu_counters; /* NPU counter local instance */
-static const char* unit_beats = "beats";
+extern struct ethosu_driver ethosu_drv; /* Default Arm Ethos-U NPU device driver object */
+static ethosu_pmu_counters s_npu_counters; /* NPU counter local instance */
+static uint32_t s_evt_mask = 0; /* PMU event mask */
+
+static const char* unit_beats = "beats";
static const char* unit_cycles = "cycles";
/**
@@ -31,7 +34,26 @@ static const char* unit_cycles = "cycles";
*/
static ethosu_pmu_counters* get_counter_instance(void)
{
- return &npu_counters;
+ return &s_npu_counters;
+}
+
+/**
+ * @brief Gets the enabled event mask from the PMU driver.
+ * @return Event mask as an unsigned 32 bit integer.
+ */
+static uint32_t get_event_mask(void)
+{
+ return s_evt_mask;
+}
+
+/**
+ * @brief Sets the enabled event mask for the PMU driver.
+ * @param[in] mask event mask as an unsigned 32 bit integer.
+ * @return none.
+ */
+static void set_event_mask(uint32_t mask)
+{
+ s_evt_mask = mask;
}
/**
@@ -49,8 +71,8 @@ static bool counter_overflow(uint32_t pmu_counter_mask)
void ethosu_pmu_init(void)
{
- uint32_t i = 0;
- uint32_t evt_mask = ETHOSU_PMU_CCNT_Msk;
+ uint32_t i = 0;
+ uint32_t evt_mask = ETHOSU_PMU_CCNT_Msk;
ethosu_pmu_counters* counters = get_counter_instance();
memset(counters, 0, sizeof(*counters));
@@ -58,31 +80,31 @@ void ethosu_pmu_init(void)
counters->num_total_counters = ETHOSU_PROFILER_NUM_COUNTERS;
#if ETHOSU_PMU_NCOUNTERS >= 4
- counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_IDLE;
+ counters->npu_evt_counters[0].event_type = ETHOSU_PMU_NPU_ACTIVE;
counters->npu_evt_counters[0].event_mask = ETHOSU_PMU_CNT1_Msk;
- counters->npu_evt_counters[0].name = "NPU IDLE";
- counters->npu_evt_counters[0].unit = unit_cycles;
+ counters->npu_evt_counters[0].name = "NPU ACTIVE";
+ counters->npu_evt_counters[0].unit = unit_cycles;
counters->npu_evt_counters[1].event_type = ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED;
counters->npu_evt_counters[1].event_mask = ETHOSU_PMU_CNT2_Msk;
- counters->npu_evt_counters[1].name = "NPU AXI0_RD_DATA_BEAT_RECEIVED";
- counters->npu_evt_counters[1].unit = unit_beats;
+ counters->npu_evt_counters[1].name = "NPU AXI0_RD_DATA_BEAT_RECEIVED";
+ counters->npu_evt_counters[1].unit = unit_beats;
counters->npu_evt_counters[2].event_type = ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN;
counters->npu_evt_counters[2].event_mask = ETHOSU_PMU_CNT3_Msk;
- counters->npu_evt_counters[2].name = "NPU AXI0_WR_DATA_BEAT_WRITTEN";
- counters->npu_evt_counters[2].unit = unit_beats;
+ counters->npu_evt_counters[2].name = "NPU AXI0_WR_DATA_BEAT_WRITTEN";
+ counters->npu_evt_counters[2].unit = unit_beats;
counters->npu_evt_counters[3].event_type = ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED;
counters->npu_evt_counters[3].event_mask = ETHOSU_PMU_CNT4_Msk;
- counters->npu_evt_counters[3].name = "NPU AXI1_RD_DATA_BEAT_RECEIVED";
- counters->npu_evt_counters[3].unit = unit_beats;
+ counters->npu_evt_counters[3].name = "NPU AXI1_RD_DATA_BEAT_RECEIVED";
+ counters->npu_evt_counters[3].unit = unit_beats;
#else /* ETHOSU_PMU_NCOUNTERS >= 4 */
- #error "NPU PMU expects a minimum of 4 available event triggered counters!"
+#error "NPU PMU expects a minimum of 4 available event triggered counters!"
#endif /* ETHOSU_PMU_NCOUNTERS >= 4 */
#if ETHOSU_DERIVED_NCOUNTERS >= 1
- counters->npu_derived_counters[0].name = "NPU ACTIVE";
+ counters->npu_derived_counters[0].name = "NPU IDLE";
counters->npu_derived_counters[0].unit = unit_cycles;
#endif /* ETHOSU_DERIVED_NCOUNTERS >= 1 */
@@ -91,16 +113,17 @@ void ethosu_pmu_init(void)
evt_mask |= counters->npu_evt_counters[i].event_mask;
}
+ set_event_mask(evt_mask);
+
/* Reset overflow status. */
- ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, evt_mask);
+ ETHOSU_PMU_Set_CNTR_OVS(&ethosu_drv, get_event_mask());
/* Enable PMU. */
ETHOSU_PMU_Enable(&ethosu_drv);
/* Enable counters for cycle and event counters. */
- ETHOSU_PMU_CNTR_Disable(&ethosu_drv, evt_mask);
+ ETHOSU_PMU_CNTR_Disable(&ethosu_drv, get_event_mask());
ethosu_pmu_reset_counters();
- ETHOSU_PMU_CNTR_Enable(&ethosu_drv, evt_mask);
}
/**
@@ -120,15 +143,14 @@ void ethosu_pmu_reset_counters(void)
ethosu_pmu_counters ethosu_get_pmu_counters(void)
{
ethosu_pmu_counters* counters = get_counter_instance();
- uint32_t i = 0;
+ uint32_t i = 0;
/* Event counters */
for (i = 0; i < ETHOSU_PMU_NCOUNTERS; ++i) {
if (counter_overflow(counters->npu_evt_counters[i].event_mask)) {
warn("Counter overflow detected for %s.\n", counters->npu_evt_counters[i].name);
}
- counters->npu_evt_counters[i].counter_value =
- ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i);
+ counters->npu_evt_counters[i].counter_value = ETHOSU_PMU_Get_EVCNTR(&ethosu_drv, i);
}
/* Total cycle count */
@@ -136,7 +158,9 @@ ethosu_pmu_counters ethosu_get_pmu_counters(void)
/* Derived counters */
#if ETHOSU_DERIVED_NCOUNTERS >= 1
- if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_IDLE) {
+ if (counters->npu_evt_counters[0].event_type == ETHOSU_PMU_NPU_ACTIVE) {
+
+ /* Compute the idle count */
counters->npu_derived_counters[0].counter_value =
counters->npu_total_ccnt - counters->npu_evt_counters[0].counter_value;
}
@@ -144,3 +168,17 @@ ethosu_pmu_counters ethosu_get_pmu_counters(void)
return *counters;
}
+
+void ethosu_inference_begin(struct ethosu_driver* drv, void* userArg)
+{
+ UNUSED(userArg);
+ ethosu_clear_cache_states();
+ ETHOSU_PMU_CNTR_Disable(drv, get_event_mask());
+ ETHOSU_PMU_CNTR_Enable(drv, get_event_mask());
+}
+
+void ethosu_inference_end(struct ethosu_driver* drv, void* userArg)
+{
+ UNUSED(userArg);
+ ETHOSU_PMU_CNTR_Disable(drv, get_event_mask());
+}
diff --git a/source/hal/source/components/npu/include/ethosu_cpu_cache.h b/source/hal/source/components/npu/include/ethosu_cpu_cache.h
index faf26c2..d5de3d5 100644
--- a/source/hal/source/components/npu/include/ethosu_cpu_cache.h
+++ b/source/hal/source/components/npu/include/ethosu_cpu_cache.h
@@ -21,6 +21,11 @@
#include <stddef.h>
/**
+ * @brief Clears all the cache state members.
+ */
+void ethosu_clear_cache_states(void);
+
+/**
* @brief Flush/clean the data cache by address and size. Passing NULL as p argument
* expects the whole cache to be flushed.
* @param[in] p Pointer to the start address.