diff options
34 files changed, 839 insertions, 547 deletions
diff --git a/docs/sections/customizing.md b/docs/sections/customizing.md index e92c327..346a34c 100644 --- a/docs/sections/customizing.md +++ b/docs/sections/customizing.md @@ -243,12 +243,14 @@ Profiler is a helper class assisting in collection of timings and Ethos-U55 cycle counts for operations. It uses platform timer to get system timing information. -| Method name | Description | -|----------------------|-----------------------------------------------------------| -| StartProfiling | Starts profiling and records the starting timing data. | -| StopProfiling | Stops profiling and records the ending timing data. | -| Reset | Resets the profiler and clears all collected data. | -| GetResultsAndReset | Gets the results as string and resets the profiler. | +| Method name | Description | +|-------------------------|----------------------------------------------------------------| +| StartProfiling | Starts profiling and records the starting timing data. | +| StopProfiling | Stops profiling and records the ending timing data. | +| StopProfilingAndReset | Stops the profiling and internally resets the platform timers. | +| Reset | Resets the profiler and clears all collected data. | +| GetAllResultsAndReset | Gets the results as string and resets the profiler. | +| SetName | Set the profiler name. | Usage example: @@ -259,7 +261,7 @@ profiler.StartProfiling(); // Code running inference to profile profiler.StopProfiling(); -info("%s\n", profiler.GetResultsAndReset().c_str()); +profiler.PrintProfilingResult(); ``` ## NN Model API @@ -571,9 +573,8 @@ Profiler profiler{&platform, "Inference"}; profiler.StartProfiling(); model.RunInference(); profiler.StopProfiling(); -std::string profileResults = profiler.GetResultsAndReset(); -info("%s\n", profileResults.c_str()); +profiler.PrintProfilingResult(); ``` ## Printing to console diff --git a/docs/sections/deployment.md b/docs/sections/deployment.md index 354d30b..3d5796f 100644 --- a/docs/sections/deployment.md +++ b/docs/sections/deployment.md @@ -267,13 +267,13 @@ off. 7. On the second serial port, output similar to section 2.2 should be visible: ```log - [INFO] Setting up system tick IRQ (for NPU) - [INFO] V2M-MPS3 revision C - [INFO] Application Note AN540, Revision B - [INFO] FPGA build 1 - [INFO] Core clock has been set to: 32000000 Hz - [INFO] CPU ID: 0x410fd220 - [INFO] CPU: Cortex-M55 r0p0 + INFO - Setting up system tick IRQ (for NPU) + INFO - V2M-MPS3 revision C + INFO - Application Note AN540, Revision B + INFO - FPGA build 1 + INFO - Core clock has been set to: 32000000 Hz + INFO - CPU ID: 0x410fd220 + INFO - CPU: Cortex-M55 r0p0 ... ``` diff --git a/docs/sections/testing_benchmarking.md b/docs/sections/testing_benchmarking.md index 43bb7f4..0c7c675 100644 --- a/docs/sections/testing_benchmarking.md +++ b/docs/sections/testing_benchmarking.md @@ -45,8 +45,8 @@ dev_ethosu_eval-<use_case>-tests ``` ```log -[INFO] native platform initialised -[INFO] ARM Ethos-U55 Evaluation application for MPS3 FPGA Prototyping Board and FastModel +INFO - native platform initialised +INFO - ARM Ethos-U55 Evaluation application for MPS3 FPGA Prototyping Board and FastModel ... =============================================================================== diff --git a/docs/use_cases/ad.md b/docs/use_cases/ad.md index ca95af8..1ff9c4f 100644 --- a/docs/use_cases/ad.md +++ b/docs/use_cases/ad.md @@ -390,41 +390,39 @@ Choice: 4. “Show NN model info” menu option prints information about model data type, input and output tensor sizes: ```log - [INFO] uTFL version: 2.5.0 - [INFO] Model info: - [INFO] Model INPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 1024 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 32 - [INFO] 2: 32 - [INFO] 3: 1 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.192437 - [INFO] ZeroPoint[0] = 11 - [INFO] Model OUTPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 8 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 8 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.048891 - [INFO] ZeroPoint[0] = -30 - [INFO] Activation buffer (a.k.a tensor arena) size used: 198016 - [INFO] Number of operators: 1 - [INFO] Operator 0: ethos-u - [INFO] Use of Arm uNPU is enabled - + INFO - uTFL version: 2.5.0 + INFO - Model info: + INFO - Model INPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 1024 bytes with dimensions + INFO - 0: 1 + INFO - 1: 32 + INFO - 2: 32 + INFO - 3: 1 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.192437 + INFO - ZeroPoint[0] = 11 + INFO - Model OUTPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 8 bytes with dimensions + INFO - 0: 1 + INFO - 1: 8 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.048891 + INFO - ZeroPoint[0] = -30 + INFO - Activation buffer (a.k.a tensor arena) size used: 198016 + INFO - Number of operators: 1 + INFO - Operator 0: ethos-u ``` 5. “List” menu option prints a list of pair ... indexes - the original filenames embedded in the application: ```log - [INFO] List of Files: - [INFO] 0 =>; anomaly_id_00_00000000.wav - [INFO] 1 =>; anomaly_id_02_00000076.wav - [INFO] 2 =>; normal_id_00_00000004.wav - [INFO] 3 =>; normal_id_02_00000001.wav + INFO - List of Files: + INFO - 0 =>; anomaly_id_00_00000000.wav + INFO - 1 =>; anomaly_id_02_00000076.wav + INFO - 2 =>; normal_id_00_00000004.wav + INFO - 3 =>; normal_id_02_00000001.wav ``` ### Running Anomaly Detection @@ -434,76 +432,30 @@ Please select the first menu option to execute Anomaly Detection. The following example illustrates application output: ```log -[INFO] Running inference on audio clip 0 => anomaly_id_00_00000000.wav -[INFO] Inference 1/13 -[INFO] Profile for Inference: - Active NPU cycles: 1081154 - Idle NPU cycles: 1012 - -[INFO] Inference 2/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080934 - Idle NPU cycles: 232 - -[INFO] Inference 3/13 -[INFO] Profile for Inference: - Active NPU cycles: 1081332 - Idle NPU cycles: 834 - -[INFO] Inference 4/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080748 - Idle NPU cycles: 418 - -[INFO] Inference 5/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080728 - Idle NPU cycles: 438 - -[INFO] Inference 6/13 -[INFO] Profile for Inference: - Active NPU cycles: 1081144 - Idle NPU cycles: 1022 - -[INFO] Inference 7/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080924 - Idle NPU cycles: 242 - -[INFO] Inference 8/13 -[INFO] Profile for Inference: - Active NPU cycles: 1081322 - Idle NPU cycles: 844 - -[INFO] Inference 9/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080738 - Idle NPU cycles: 428 - -[INFO] Inference 10/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080718 - Idle NPU cycles: 448 - -[INFO] Inference 11/13 -[INFO] Profile for Inference: - Active NPU cycles: 1081134 - Idle NPU cycles: 1032 - -[INFO] Inference 12/13 -[INFO] Profile for Inference: - Active NPU cycles: 1080914 - Idle NPU cycles: 252 - -[INFO] Inference 13/13 -[INFO] Profile for Inference: - Active NPU cycles: 1081312 - Idle NPU cycles: 854 - -[INFO] Average anomaly score is: -0.024493 +INFO - Running inference on audio clip 0 => anomaly_id_00_00000000.wav +INFO - Inference 1/13 +INFO - Inference 2/13 +INFO - Inference 3/13 +INFO - Inference 4/13 +INFO - Inference 5/13 +INFO - Inference 6/13 +INFO - Inference 7/13 +INFO - Inference 8/13 +INFO - Inference 9/13 +INFO - Inference 10/13 +INFO - Inference 11/13 +INFO - Inference 12/13 +INFO - Inference 13/13 +INFO - Average anomaly score is: -0.024493 Anomaly threshold is: -0.800000 Anomaly detected! - +INFO - Profile for Inference: +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 628122 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 135087 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 62870 +INFO - NPU ACTIVE cycles: 1081007 +INFO - NPU IDLE cycles: 626 +INFO - NPU total cycles: 1081634 ``` As multiple inferences have to be run for one clip it will take around a minute or so for all inferences to complete. @@ -515,9 +467,19 @@ The profiling section of the log shows that for each inference. For the last inf - Ethos-U55's PMU report: - - 1,081,312 active cycles: number of cycles that were used for computation + - 1,081,634 total cycle: The number of NPU cycles + + - 1,081,007 active cycles: number of NPU cycles that were used for computation + + - 626 idle cycles: number of cycles for which the NPU was idle + + - 628,122 AXI0 read cycles: The number of cycles the NPU spends to execute AXI0 read transactions. + AXI0 is the bus where Ethos-U55 NPU reads and writes to the computation buffers (activation buf/tensor arenas). + + - 135,087 AXI0 write cycles: The number of cycles the NPU spends to execute AXI0 write transactions. - - 854 idle cycles: number of cycles for which the NPU was idle + - 62,870 AXI1 read cycles: The number of cycles the NPU spends to execute AXI1 read transactions. + AXI1 is the bus where Ethos-U55 NPU reads the model (read only) - For FPGA platforms, CPU cycle count can also be enabled. For FVP, however, CPU cycle counters should not be used as the CPU model is not cycle-approximate or cycle-accurate. diff --git a/docs/use_cases/asr.md b/docs/use_cases/asr.md index d224aca..4600698 100644 --- a/docs/use_cases/asr.md +++ b/docs/use_cases/asr.md @@ -431,7 +431,7 @@ Choice: compiled audio. > **Note:** Note that if the clip is over a certain length, the application will invoke multiple inference runs to ->cover the entire file. + >cover the entire file. 2. “Classify audio clip at chosen index” menu option will run inference on the chosen audio clip. @@ -444,41 +444,40 @@ Choice: 4. “Show NN model info” menu option prints information about model data type, input and output tensor sizes: ```log - [INFO] uTFL version: 2.5.0 - [INFO] Model info: - [INFO] Model INPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 11544 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 296 - [INFO] 2: 39 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.110316 - [INFO] ZeroPoint[0] = -11 - [INFO] Model OUTPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 4292 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 1 - [INFO] 2: 148 - [INFO] 3: 29 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.003906 - [INFO] ZeroPoint[0] = -128 - [INFO] Activation buffer (a.k.a tensor arena) size used: 783168 - [INFO] Number of operators: 1 - [INFO] Operator 0: ethos-u - [INFO] Use of Arm uNPU is enabled + INFO - uTFL version: 2.5.0 + INFO - Model info: + INFO - Model INPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 11544 bytes with dimensions + INFO - 0: 1 + INFO - 1: 296 + INFO - 2: 39 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.110316 + INFO - ZeroPoint[0] = -11 + INFO - Model OUTPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 4292 bytes with dimensions + INFO - 0: 1 + INFO - 1: 1 + INFO - 2: 148 + INFO - 3: 29 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.003906 + INFO - ZeroPoint[0] = -128 + INFO - Activation buffer (a.k.a tensor arena) size used: 783168 + INFO - Number of operators: 1 + INFO - Operator 0: ethos-u ``` 5. “List” menu option prints a list of pair audio clip indexes - the original filenames embedded in the application: ```log - [INFO] List of Files: - [INFO] 0 => anotherdoor.wav - [INFO] 1 => anotherengineer.wav - [INFO] 2 => itellyou.wav - [INFO] 3 => testingroutine.wav + INFO - List of Files: + INFO - 0 => anotherdoor.wav + INFO - 1 => anotherengineer.wav + INFO - 2 => itellyou.wav + INFO - 3 => testingroutine.wav ``` ### Running Automatic Speech Recognition @@ -488,28 +487,21 @@ Please select the first menu option to execute Automatic Speech Recognition. The following example illustrates application output: ```log -[INFO] Running inference on audio clip 0 => anotherdoor.wav -[INFO] Inference 1/2 -[INFO] Profile for pre-processing: - Active NPU cycles: 0 - Idle NPU cycles: 6 - -[INFO] Profile for Inference: - Active NPU cycles: 28924342 - Idle NPU cycles: 824 - -[INFO] Inference 2/2 -[INFO] Profile for pre-processing: - Active NPU cycles: 0 - Idle NPU cycles: 6 - -[INFO] Profile for Inference: - Active NPU cycles: 28924298 - Idle NPU cycles: 868 - -[INFO] Result for inf 0: and he walked immediately out o t -[INFO] Result for inf 1: he aparctment by anoer dor -[INFO] Final result: and he walked immediately out o the aparctment by anoer dor +INFO - Running inference on audio clip 0 => another_door.wav +INFO - Inference 1/2 +INFO - Inference 2/2 +INFO - Final results: +INFO - Total number of inferences: 2 +INFO - For timestamp: 0.000000 (inference #: 0); label: and he walked immediately out of th +INFO - For timestamp: 0.000000 (inference #: 1); label: e apartment by another door +INFO - Complete recognition: and he walked immediately out of the apartment by another door +INFO - Profile for Inference : +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 6564262 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 928889 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 841712 +INFO - NPU ACTIVE cycles: 28450696 +INFO - NPU IDLE cycles: 476 +INFO - NPU total cycles: 28451172 ``` It could take several minutes to complete each inference (average time is 5-7 minutes), and on this audio clip multiple @@ -519,9 +511,19 @@ The profiling section of the log shows that for the first inference: - Ethos-U55's PMU report: - - 28,924,298 active cycles: number of NPU cycles that were used for computation + - 28,451,172 total cycle: The number of NPU cycles - - 868 idle cycles: number of cycles for which the NPU was idle + - 28,450,696 active cycles: number of NPU cycles that were used for computation + + - 476 idle cycles: number of cycles for which the NPU was idle + + - 6,564,262 AXI0 read cycles: The number of cycles the NPU spends to execute AXI0 read transactions. + AXI0 is the bus where Ethos-U55 NPU reads and writes to the computation buffers (activation buf/tensor arenas). + + - 928,889 AXI0 write cycles: The number of cycles the NPU spends to execute AXI0 write transactions. + + - 841,712 AXI1 read cycles: The number of cycles the NPU spends to execute AXI1 read transactions. + AXI1 is the bus where Ethos-U55 NPU reads the model (read only) - For FPGA platforms, CPU cycle count can also be enabled. For FVP, however, CPU cycle counters should not be used as the CPU model is not cycle-approximate or cycle-accurate. diff --git a/docs/use_cases/img_class.md b/docs/use_cases/img_class.md index 7a409f2..b26b746 100644 --- a/docs/use_cases/img_class.md +++ b/docs/use_cases/img_class.md @@ -371,40 +371,39 @@ Choice: 4. “Show NN model info” menu option prints information about model data type, input and output tensor sizes: ```log - [INFO] uTFL version: 2.5.0 - [INFO] Model info: - [INFO] Model INPUT tensors: - [INFO] tensor type is UINT8 - [INFO] tensor occupies 150528 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 224 - [INFO] 2: 224 - [INFO] 3: 3 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.007812 - [INFO] ZeroPoint[0] = 128 - [INFO] Model OUTPUT tensors: - [INFO] tensor type is UINT8 - [INFO] tensor occupies 1001 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 1001 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.098893 - [INFO] ZeroPoint[0] = 58 - [INFO] Activation buffer (a.k.a tensor arena) size used: 521760 - [INFO] Number of operators: 1 - [INFO] Operator 0: ethos-u - [INFO] Use of Arm uNPU is enabled + INFO - uTFL version: 2.5.0 + INFO - Model info: + INFO - Model INPUT tensors: + INFO - tensor type is UINT8 + INFO - tensor occupies 150528 bytes with dimensions + INFO - 0: 1 + INFO - 1: 224 + INFO - 2: 224 + INFO - 3: 3 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.007812 + INFO - ZeroPoint[0] = 128 + INFO - Model OUTPUT tensors: + INFO - tensor type is UINT8 + INFO - tensor occupies 1001 bytes with dimensions + INFO - 0: 1 + INFO - 1: 1001 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.098893 + INFO - ZeroPoint[0] = 58 + INFO - Activation buffer (a.k.a tensor arena) size used: 521760 + INFO - Number of operators: 1 + INFO - Operator 0: ethos-u ``` 5. “List Images” menu option prints a list of pair image indexes - the original filenames embedded in the application: ```log - [INFO] List of Files: - [INFO] 0 => cat.bmp - [INFO] 1 => dog.bmp - [INFO] 2 => kimono.bmp - [INFO] 3 => tiger.bmp + INFO - List of Files: + INFO - 0 => cat.bmp + INFO - 1 => dog.bmp + INFO - 2 => kimono.bmp + INFO - 3 => tiger.bmp ``` ### Running Image Classification @@ -414,16 +413,21 @@ Please select the first menu option to execute Image Classification. The following example illustrates application output for classification: ```log -[INFO] Running inference on image 0 => cat.bmp -[INFO] Profile for Inference: - Active NPU cycles: 7622641 - Idle NPU cycles: 525 - -[INFO] 0) 282 (14.636096) -> tabby, tabby cat -[INFO] 1) 286 (14.537203) -> Egyptian cat -[INFO] 2) 283 (12.757138) -> tiger cat -[INFO] 3) 458 (7.021370) -> bow tie, bow-tie, bowtie -[INFO] 4) 288 (7.021370) -> lynx, catamount +INFO - Running inference on image 0 => cat.bmp +INFO - Final results: +INFO - Total number of inferences: 1 +INFO - 0) 282 (14.636096) -> tabby, tabby cat +INFO - 1) 286 (14.537203) -> Egyptian cat +INFO - 2) 283 (12.757138) -> tiger cat +INFO - 3) 458 (7.021370) -> bow tie, bow-tie, bowtie +INFO - 4) 288 (7.021370) -> lynx, catamount +INFO - Profile for Inference: +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 2489726 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 1098726 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 471129 +INFO - NPU ACTIVE cycles: 7489258 +INFO - NPU IDLE cycles: 914 +INFO - NPU total cycles: 7490172 ``` It could take several minutes to complete one inference run (average time is 2-3 minutes). @@ -435,9 +439,19 @@ The profiling section of the log shows that for this inference: - Ethos-U55's PMU report: - - 7,622,641 active cycles: number of NPU cycles that were used for computation + - 7,490,172 total cycle: The number of NPU cycles - - 525 idle cycles: number of cycles for which the NPU was idle + - 7,489,258 active cycles: number of NPU cycles that were used for computation + + - 914 idle cycles: number of cycles for which the NPU was idle + + - 2,489,726 AXI0 read cycles: The number of cycles the NPU spends to execute AXI0 read transactions. + AXI0 is the bus where Ethos-U55 NPU reads and writes to the computation buffers (activation buf/tensor arenas). + + - 1,098,726 AXI0 write cycles: The number of cycles the NPU spends to execute AXI0 write transactions. + + - 471,129 AXI1 read cycles: The number of cycles the NPU spends to execute AXI1 read transactions. + AXI1 is the bus where Ethos-U55 NPU reads the model (read only) - For FPGA platforms, CPU cycle count can also be enabled. For FVP, however, CPU cycle counters should not be used as the CPU model is not cycle-approximate or cycle-accurate. diff --git a/docs/use_cases/inference_runner.md b/docs/use_cases/inference_runner.md index ffb205e..350c1e8 100644 --- a/docs/use_cases/inference_runner.md +++ b/docs/use_cases/inference_runner.md @@ -278,9 +278,14 @@ After the application has started the inference starts immediately and it output The following example illustrates application output: ```log -[INFO] Profile for Inference: - Active NPU cycles: 26976 - Idle NPU cycles: 196 +INFO - Final results: +INFO - Profile for Inference : +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 9332 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 3248 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 2219 +INFO - NPU ACTIVE cycles: 33145 +INFO - NPU IDLE cycles: 1033 +INFO - NPU total cycles: 34178 ``` After running an inference on randomly generated data, the output of the log shows the profiling results that for this @@ -288,9 +293,19 @@ inference: - Ethos-U55's PMU report: - - 26,976 active cycles: number of cycles that were used for computation + - 34,178 total cycle: The number of NPU cycles - - 196 idle cycles: number of cycles for which the NPU was idle + - 33,145 active cycles: number of NPU cycles that were used for computation + + - 1,033 idle cycles: number of cycles for which the NPU was idle + + - 2,219 AXI0 read cycles: The number of cycles the NPU spends to execute AXI0 read transactions. + AXI0 is the bus where Ethos-U55 NPU reads and writes to the computation buffers (activation buf/tensor arenas). + + - 3,248 AXI0 write cycles: The number of cycles the NPU spends to execute AXI0 write transactions. + + - 9,332 AXI1 read cycles: The number of cycles the NPU spends to execute AXI1 read transactions. + AXI1 is the bus where Ethos-U55 NPU reads the model (read only) - For FPGA platforms, CPU cycle count can also be enabled. For FVP, however, CPU cycle counters should not be used as the CPU model is not cycle-approximate or cycle-accurate. diff --git a/docs/use_cases/kws.md b/docs/use_cases/kws.md index 316b501..4942744 100644 --- a/docs/use_cases/kws.md +++ b/docs/use_cases/kws.md @@ -405,41 +405,40 @@ Choice: 4. “Show NN model info” menu option prints information about model data type, input and output tensor sizes: ```log - [INFO] uTFL version: 2.5.0 - [INFO] Model info: - [INFO] Model INPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 490 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 1 - [INFO] 2: 49 - [INFO] 3: 10 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 1.107164 - [INFO] ZeroPoint[0] = 95 - [INFO] Model OUTPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 12 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 12 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.003906 - [INFO] ZeroPoint[0] = -128 - [INFO] Activation buffer (a.k.a tensor arena) size used: 72848 - [INFO] Number of operators: 1 - [INFO] Operator 0: ethos-u - [INFO] Use of Arm uNPU is enabled + INFO - uTFL version: 2.5.0 + INFO - Model info: + INFO - Model INPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 490 bytes with dimensions + INFO - 0: 1 + INFO - 1: 1 + INFO - 2: 49 + INFO - 3: 10 + INFO - Quant dimension: 0 + INFO - Scale[0] = 1.107164 + INFO - ZeroPoint[0] = 95 + INFO - Model OUTPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 12 bytes with dimensions + INFO - 0: 1 + INFO - 1: 12 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.003906 + INFO - ZeroPoint[0] = -128 + INFO - Activation buffer (a.k.a tensor arena) size used: 72848 + INFO - Number of operators: 1 + INFO - Operator 0: ethos-u ``` 5. “List audio clips” menu option prints a list of pair audio indexes - the original filenames embedded in the application: ```log - [INFO] List of Files: - [INFO] 0 => down.wav - [INFO] 1 => rightleftup.wav - [INFO] 2 => yes.wav - [INFO] 3 => yesnogostop.wav + INFO - List of Files: + INFO - 0 => down.wav + INFO - 1 => rightleftup.wav + INFO - 2 => yes.wav + INFO - 3 => yesnogostop.wav ``` ### Running Keyword Spotting @@ -448,15 +447,18 @@ Selecting the first option will run inference on the first file. The following example illustrates application output for classification: -```log -[INFO] Running inference on audio clip 0 => down.wav -[INFO] Inference 1/1 -[INFO] Profile for Inference: - Active NPU cycles: 680400 - Idle NPU cycles: 766 - -[INFO] For timestamp: 0.000000 (inference #: 0); threshold: 0.900000 -[INFO] label @ 0: down, score: 0.996094 +```logINFO - Running inference on audio clip 0 => down.wav +INFO - Inference 1/1 +INFO - Final results: +INFO - Total number of inferences: 1 +INFO - For timestamp: 0.000000 (inference #: 0); label: down, score: 0.996094; threshold: 0.900000 +INFO - Profile for Inference: +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 217385 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 82607 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 59608 +INFO - NPU ACTIVE cycles: 680611 +INFO - NPU IDLE cycles: 561 +INFO - NPU total cycles: 681172 ``` Each inference should take less than 30 seconds on most systems running Fast Model. @@ -464,9 +466,19 @@ The profiling section of the log shows that for this inference: - Ethos-U55's PMU report: - - 680,400 active cycles: number of cycles that were used for computation + - 681,172 total cycle: The number of NPU cycles + + - 680,611 active cycles: The number of NPU cycles that were used for computation + + - 561 idle cycles: number of cycles for which the NPU was idle + + - 217,385 AXI0 read cycles: The number of cycles the NPU spends to execute AXI0 read transactions. + AXI0 is the bus where Ethos-U55 NPU reads and writes to the computation buffers (activation buf/tensor arenas). + + - 82,607 write cycles: The number of cycles the NPU spends to execute AXI0 write transactions. - - 766 idle cycles: number of cycles for which the NPU was idle + - 59,608 AXI1 read cycles: The number of cycles the NPU spends to execute AXI1 read transactions. + AXI1 is the bus where Ethos-U55 NPU reads the model (read only) - For FPGA platforms, CPU cycle count can also be enabled. For FVP, however, CPU cycle counters should not be used as the CPU model is not cycle-approximate or cycle-accurate. diff --git a/docs/use_cases/kws_asr.md b/docs/use_cases/kws_asr.md index e79b887..132a82d 100644 --- a/docs/use_cases/kws_asr.md +++ b/docs/use_cases/kws_asr.md @@ -468,72 +468,72 @@ Choice: 4. “Show NN model info” menu option prints information about model data type, input and output tensor sizes: ```log - [INFO] uTFL version: 2.5.0 - [INFO] Model INPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 490 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 1 - [INFO] 2: 49 - [INFO] 3: 10 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 1.107164 - [INFO] ZeroPoint[0] = 95 - [INFO] Model OUTPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 12 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 12 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.003906 - [INFO] ZeroPoint[0] = -128 - [INFO] Activation buffer (a.k.a tensor arena) size used: 123616 - [INFO] Number of operators: 16 - [INFO] Operator 0: RESHAPE - [INFO] Operator 1: CONV_2D - [INFO] Operator 2: DEPTHWISE_CONV_2D - [INFO] Operator 3: CONV_2D - [INFO] Operator 4: DEPTHWISE_CONV_2D - [INFO] Operator 5: CONV_2D - [INFO] Operator 6: DEPTHWISE_CONV_2D - [INFO] Operator 7: CONV_2D - [INFO] Operator 8: DEPTHWISE_CONV_2D - [INFO] Operator 9: CONV_2D - [INFO] Operator 10: DEPTHWISE_CONV_2D - [INFO] Operator 11: CONV_2D - [INFO] Operator 12: AVERAGE_POOL_2D - [INFO] Operator 13: RESHAPE - [INFO] Operator 14: FULLY_CONNECTED - [INFO] Operator 15: SOFTMAX - [INFO] Model INPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 11544 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 296 - [INFO] 2: 39 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.110316 - [INFO] ZeroPoint[0] = -11 - [INFO] Model OUTPUT tensors: - [INFO] tensor type is INT8 - [INFO] tensor occupies 4292 bytes with dimensions - [INFO] 0: 1 - [INFO] 1: 1 - [INFO] 2: 148 - [INFO] 3: 29 - [INFO] Quant dimension: 0 - [INFO] Scale[0] = 0.003906 - [INFO] ZeroPoint[0] = -128 - [INFO] Activation buffer (a.k.a tensor arena) size used: 809808 - [INFO] Number of operators: 1 - [INFO] Operator 0: ethos-u + INFO - uTFL version: 2.5.0 + INFO - Model INPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 490 bytes with dimensions + INFO - 0: 1 + INFO - 1: 1 + INFO - 2: 49 + INFO - 3: 10 + INFO - Quant dimension: 0 + INFO - Scale[0] = 1.107164 + INFO - ZeroPoint[0] = 95 + INFO - Model OUTPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 12 bytes with dimensions + INFO - 0: 1 + INFO - 1: 12 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.003906 + INFO - ZeroPoint[0] = -128 + INFO - Activation buffer (a.k.a tensor arena) size used: 123616 + INFO - Number of operators: 16 + INFO - Operator 0: RESHAPE + INFO - Operator 1: CONV_2D + INFO - Operator 2: DEPTHWISE_CONV_2D + INFO - Operator 3: CONV_2D + INFO - Operator 4: DEPTHWISE_CONV_2D + INFO - Operator 5: CONV_2D + INFO - Operator 6: DEPTHWISE_CONV_2D + INFO - Operator 7: CONV_2D + INFO - Operator 8: DEPTHWISE_CONV_2D + INFO - Operator 9: CONV_2D + INFO - Operator 10: DEPTHWISE_CONV_2D + INFO - Operator 11: CONV_2D + INFO - Operator 12: AVERAGE_POOL_2D + INFO - Operator 13: RESHAPE + INFO - Operator 14: FULLY_CONNECTED + INFO - Operator 15: SOFTMAX + INFO - Model INPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 11544 bytes with dimensions + INFO - 0: 1 + INFO - 1: 296 + INFO - 2: 39 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.110316 + INFO - ZeroPoint[0] = -11 + INFO - Model OUTPUT tensors: + INFO - tensor type is INT8 + INFO - tensor occupies 4292 bytes with dimensions + INFO - 0: 1 + INFO - 1: 1 + INFO - 2: 148 + INFO - 3: 29 + INFO - Quant dimension: 0 + INFO - Scale[0] = 0.003906 + INFO - ZeroPoint[0] = -128 + INFO - Activation buffer (a.k.a tensor arena) size used: 809808 + INFO - Number of operators: 1 + INFO - Operator 0: ethos-u ``` 5. “List” menu option prints a list of pair ... indexes - the original filenames embedded in the application: ```log - [INFO] List of Files: - [INFO] 0 => yesnogostop.wav + INFO - List of Files: + INFO - 0 => yesnogostop.wav ``` ### Running Keyword Spotting and Automatic Speech Recognition @@ -543,29 +543,31 @@ Please select the first menu option to execute Keyword Spotting and Automatic Sp The following example illustrates application output: ```log -[INFO] KWS audio data window size 16000 -[INFO] Running KWS inference on audio clip 0 => yesnogostop.wav -[INFO] Inference 1/7 -[INFO] Profile for Inference: - Active NPU cycles: 0 - Idle NPU cycles: 6 - -[INFO] For timestamp: 0.000000 (inference #: 0); threshold: 0.900000 -[INFO] label @ 0: yes, score: 0.996094 -[INFO] Keyword spotted -[INFO] Inference 1/2 -[INFO] Profile for Inference: - Active NPU cycles: 28924742 - Idle NPU cycles: 424 - -[INFO] Inference 2/2 -[INFO] Profile for Inference: - Active NPU cycles: 28924740 - Idle NPU cycles: 426 - -[INFO] Result for inf 0: no gow -[INFO] Result for inf 1: stoppe -[INFO] Final result: no gow stoppe +INFO - KWS audio data window size 16000 +INFO - Running KWS inference on audio clip 0 => yesnogostop.wav +INFO - Inference 1/7 +INFO - For timestamp: 0.000000 (inference #: 0); threshold: 0.900000 +INFO - label @ 0: yes, score: 0.996094 +INFO - Profile for Inference: +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 217385 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 82607 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 59608 +INFO - NPU ACTIVE cycles: 680611 +INFO - NPU IDLE cycles: 561 +INFO - NPU total cycles: 681172 +INFO - Keyword spotted +INFO - Inference 1/2 +INFO - Inference 2/2 +INFO - Result for inf 0: no gow +INFO - Result for inf 1: stoppe +INFO - Final result: no gow stoppe +INFO - Profile for Inference: +INFO - NPU AXI0_RD_DATA_BEAT_RECEIVED cycles: 13520864 +INFO - NPU AXI0_WR_DATA_BEAT_WRITTEN cycles: 2841970 +INFO - NPU AXI1_RD_DATA_BEAT_RECEIVED cycles: 2717670 +INFO - NPU ACTIVE cycles: 28909309 +INFO - NPU IDLE cycles: 863 +INFO - NPU total cycles: 28910172 ``` It could take several minutes to complete one inference run (average time is 2-3 minutes). @@ -578,9 +580,19 @@ The profiling section of the log shows that for the ASR inference: - Ethos-U55's PMU report: - - 28,924,740 active cycles: number of cycles that were used for computation + - 28,910,172 total cycle: The number of NPU cycles - - 426 idle cycles: number of cycles for which the NPU was idle + - 28,909,309 active cycles: number of NPU cycles that were used for computation + + - 863 idle cycles: number of cycles for which the NPU was idle + + - 13,520,864 AXI0 read cycles: The number of cycles the NPU spends to execute AXI0 read transactions. + AXI0 is the bus where Ethos-U55 NPU reads and writes to the computation buffers (activation buf/tensor arenas). + + - 2,841,970 AXI0 write cycles: The number of cycles the NPU spends to execute AXI0 write transactions. + + - 2,717,670 AXI1 read cycles: The number of cycles the NPU spends to execute AXI1 read transactions. + AXI1 is the bus where Ethos-U55 NPU reads the model (read only) - For FPGA platforms, CPU cycle count can also be enabled. For FVP, however, CPU cycle counters should not be used as the CPU model is not cycle-approximate or cycle-accurate. diff --git a/source/application/hal/include/timer.h b/source/application/hal/include/timer.h index 2955b7f..4a09fd4 100644 --- a/source/application/hal/include/timer.h +++ b/source/application/hal/include/timer.h @@ -56,11 +56,9 @@ typedef struct _platform_timer { /* Gets difference in CPU cycle counts. */ uint32_t (* get_cpu_cycle_diff)(time_counter *start, time_counter *end); - /* Gets the difference in terms of total NPU cycle counts. */ - uint64_t (* get_npu_total_cycle_diff)(time_counter *start, time_counter *end); - - /* Gets the difference in terms of active NPU cycle counts. */ - uint64_t (* get_npu_active_cycle_diff)(time_counter *start, time_counter *end); + /* Gets the difference in terms of cycle counts for collected pmu counters. */ + int (* get_npu_cycles_diff)(time_counter *start, time_counter *end, + uint64_t* pmu_counters_values, size_t size); /* Wraps get_time_counter function with additional profiling * initialisation, if required. */ diff --git a/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h b/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h index f049209..ffb55e7 100644 --- a/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h +++ b/source/application/hal/platforms/bare-metal/bsp/bsp-core/include/bsp_core_log.h @@ -32,31 +32,31 @@ #endif /*LOG_LEVEL*/ #if (LOG_LEVEL == LOG_LEVEL_TRACE) - #define trace(...) printf("[TRACE] "); printf(__VA_ARGS__) + #define trace(...) printf("TRACE - "); printf(__VA_ARGS__) #else #define trace(...) #endif /* LOG_LEVEL == LOG_LEVEL_TRACE */ #if (LOG_LEVEL <= LOG_LEVEL_DEBUG) - #define debug(...) printf("[DEBUG] "); printf(__VA_ARGS__) + #define debug(...) printf("DEBUG - "); printf(__VA_ARGS__) #else #define debug(...) #endif /* LOG_LEVEL > LOG_LEVEL_TRACE */ #if (LOG_LEVEL <= LOG_LEVEL_INFO) - #define info(...) printf("[INFO] "); printf(__VA_ARGS__) + #define info(...) printf("INFO - "); printf(__VA_ARGS__) #else #define info(...) #endif /* LOG_LEVEL > LOG_LEVEL_DEBUG */ #if (LOG_LEVEL <= LOG_LEVEL_WARN) - #define warn(...) printf("[WARN] "); printf(__VA_ARGS__) + #define warn(...) printf("WARN - "); printf(__VA_ARGS__) #else #define warn(...) #endif /* LOG_LEVEL > LOG_LEVEL_INFO */ #if (LOG_LEVEL <= LOG_LEVEL_ERROR) - #define printf_err(...) printf("[ERROR] "); printf(__VA_ARGS__) + #define printf_err(...) printf("ERROR - "); printf(__VA_ARGS__) #else #define printf_err(...) #endif /* LOG_LEVEL > LOG_LEVEL_INFO */ diff --git a/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c b/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c index 7257c1d..ef31a71 100644 --- a/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c +++ b/source/application/hal/platforms/bare-metal/timer/baremetal_timer.c @@ -50,6 +50,53 @@ static uint64_t bm_get_npu_total_cycle_diff(time_counter *st, static uint64_t bm_get_npu_active_cycle_diff(time_counter *st, time_counter *end); +/** @brief Gets the difference in idle NPU cycle counts + * @param[in] st Pointer to time_counter value at start time. + * @param[in] end Pointer to time_counter value at end. + * @return Idle NPU cycle counts difference between the arguments expressed + * as unsigned 64 bit integer. + **/ +static uint64_t bm_get_npu_idle_cycle_diff(time_counter *st, + time_counter *end); + +/** @brief Gets the difference in axi0 bus reads cycle counts + * @param[in] st Pointer to time_counter value at start time. + * @param[in] end Pointer to time_counter value at end. + * @return NPU AXI0 read cycle counts difference between the arguments expressed + * as unsigned 64 bit integer. + **/ +static uint64_t bm_get_npu_axi0_read_cycle_diff(time_counter *st, + time_counter *end); + +/** @brief Gets the difference in axi0 bus writes cycle counts + * @param[in] st Pointer to time_counter value at start time. + * @param[in] end Pointer to time_counter value at end. + * @return NPU AXI0 write cycle counts difference between the arguments expressed + * as unsigned 64 bit integer. + **/ +static uint64_t bm_get_npu_axi0_write_cycle_diff(time_counter *st, + time_counter *end); + +/** @brief Gets the difference in axi1 bus reads cycle counts + * @param[in] st Pointer to time_counter value at start time. + * @param[in] end Pointer to time_counter value at end. + * @return NPU AXI1 read cycle counts difference between the arguments expressed + * as unsigned 64 bit integer. + **/ +static uint64_t bm_get_npu_axi1_read_cycle_diff(time_counter *st, + time_counter *end); + +/** @brief Gets the difference for 6 collected cycle counts: + * 1) total NPU + * 2) active NPU + * 3) idle NPU + * 4) axi0 read + * 5) axi0 write + * 6) axi1 read + * */ +static int bm_get_npu_cycle_diff(time_counter *st, time_counter *end, + uint64_t* pmu_counters_values, const size_t size); + #endif /* defined (ARM_NPU) */ #if defined(MPS3_PLATFORM) @@ -126,8 +173,7 @@ void init_timer(platform_timer *timer) #if defined (ARM_NPU) /* We are capable of reporting npu cycle counts. */ timer->cap.npu_cycles = 1; - timer->get_npu_total_cycle_diff = bm_get_npu_total_cycle_diff; - timer->get_npu_active_cycle_diff = bm_get_npu_active_cycle_diff; + timer->get_npu_cycles_diff = bm_get_npu_cycle_diff; _init_ethosu_cyclecounter(); #endif /* defined (ARM_NPU) */ @@ -149,37 +195,89 @@ static void _init_ethosu_cyclecounter(void) /* Reset overflow status. */ ETHOSU_PMU_Set_CNTR_OVS(ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CCNT_Msk); - /* Set the counter #0 to count idle cycles. */ + /* We can retrieve only 4 PMU counters: */ ETHOSU_PMU_Set_EVTYPER(0, ETHOSU_PMU_NPU_IDLE); + ETHOSU_PMU_Set_EVTYPER(1, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN); + ETHOSU_PMU_Set_EVTYPER(3, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED); /* Enable PMU. */ ETHOSU_PMU_Enable(); /* Enable counters for cycle and counter# 0. */ - ETHOSU_PMU_CNTR_Enable(ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CCNT_Msk); - + ETHOSU_PMU_CNTR_Enable(ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CNT2_Msk | ETHOSU_PMU_CNT3_Msk | ETHOSU_PMU_CNT4_Msk| ETHOSU_PMU_CCNT_Msk); _reset_ethosu_counters(); } +static int bm_get_npu_cycle_diff(time_counter *st, time_counter *end, + uint64_t* pmu_counters_values, const size_t size) +{ + if (size == 6) { + pmu_counters_values[0] = bm_get_npu_total_cycle_diff(st, end); + pmu_counters_values[1] = bm_get_npu_active_cycle_diff(st, end); + pmu_counters_values[2] = bm_get_npu_idle_cycle_diff(st, end); + pmu_counters_values[3] = bm_get_npu_axi0_read_cycle_diff(st, end); + pmu_counters_values[4] = bm_get_npu_axi0_write_cycle_diff(st, end); + pmu_counters_values[5] = bm_get_npu_axi1_read_cycle_diff(st, end); + return 0; + } else { + return 1; + } +} + static uint64_t bm_get_npu_total_cycle_diff(time_counter *st, time_counter *end) { return end->npu_total_ccnt - st->npu_total_ccnt; } -static uint64_t bm_get_npu_active_cycle_diff(time_counter *st, time_counter *end) +static uint32_t counter_overflow(uint32_t pmu_counter_mask) { /* Check for overflow: The idle counter is 32 bit while the total cycle count is 64 bit. */ const uint32_t overflow_status = ETHOSU_PMU_Get_CNTR_OVS(); + return pmu_counter_mask & overflow_status; +} - if (ETHOSU_PMU_CNT1_Msk & overflow_status) { +static uint64_t bm_get_npu_idle_cycle_diff(time_counter *st, time_counter *end) +{ + if (counter_overflow(ETHOSU_PMU_CNT1_Msk)) { printf_err("EthosU PMU idle counter overflow.\n"); return 0; } + return (uint64_t)(end->npu_idle_ccnt - st->npu_idle_ccnt); +} +static uint64_t bm_get_npu_active_cycle_diff(time_counter *st, time_counter *end) +{ /* Active NPU time = total time - idle time */ - return (bm_get_npu_total_cycle_diff(st, end) + - (uint64_t)(st->npu_idle_ccnt)) - (uint64_t)(end->npu_idle_ccnt); + return bm_get_npu_total_cycle_diff(st, end) - bm_get_npu_idle_cycle_diff(st, end); +} + +static uint64_t bm_get_npu_axi0_read_cycle_diff(time_counter *st, time_counter *end) +{ + if (counter_overflow(ETHOSU_PMU_CNT2_Msk)) { + printf_err("EthosU PMU axi0 read counter overflow.\n"); + return 0; + } + return (uint64_t)(end->npu_axi0_read_ccnt - st->npu_axi0_read_ccnt); +} + +static uint64_t bm_get_npu_axi0_write_cycle_diff(time_counter *st, time_counter *end) +{ + if (counter_overflow(ETHOSU_PMU_CNT3_Msk)) { + printf_err("EthosU PMU axi0 write counter overflow.\n"); + return 0; + } + return (uint64_t)(end->npu_axi0_write_ccnt - st->npu_axi0_write_ccnt); +} + +static uint64_t bm_get_npu_axi1_read_cycle_diff(time_counter *st, time_counter *end) +{ + if (counter_overflow(ETHOSU_PMU_CNT4_Msk)) { + printf_err("EthosU PMU axi1 read counter overflow.\n"); + return 0; + } + return (uint64_t)(end->npu_axi1_read_ccnt - st->npu_axi1_read_ccnt); } #endif /* defined (ARM_NPU) */ @@ -199,15 +297,22 @@ static time_counter bm_get_time_counter(void) .counter = get_time_counter(), #if defined (ARM_NPU) - .npu_idle_ccnt = ETHOSU_PMU_Get_EVCNTR(0), - .npu_total_ccnt = ETHOSU_PMU_Get_CCNTR() + .npu_total_ccnt = ETHOSU_PMU_Get_CCNTR(), + .npu_idle_ccnt = ETHOSU_PMU_Get_EVCNTR(0), + .npu_axi0_read_ccnt = ETHOSU_PMU_Get_EVCNTR(1), + .npu_axi0_write_ccnt = ETHOSU_PMU_Get_EVCNTR(2), + .npu_axi1_read_ccnt = ETHOSU_PMU_Get_EVCNTR(3) #endif /* defined (ARM_NPU) */ }; #if defined (ARM_NPU) - debug("NPU total cc: %llu; NPU idle cc: %u\n", - t.npu_total_ccnt, t.npu_idle_ccnt); + debug("NPU total cc: %llu; NPU idle cc: %u; NPU axi0 read cc: %u; NPU axi0 write cc: %u; NPU axi1 read cc: %u\n", + t.npu_total_ccnt, + t.npu_idle_ccnt, + t.npu_axi0_read_ccnt, + t.npu_axi0_write_ccnt, + t.npu_axi1_read_ccnt); #endif /* defined (ARM_NPU) */ return t; diff --git a/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h b/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h index c8fc32c..3020dac 100644 --- a/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h +++ b/source/application/hal/platforms/bare-metal/timer/include/baremetal_timer.h @@ -34,6 +34,9 @@ typedef struct bm_time_counter { #if defined (ARM_NPU) uint64_t npu_total_ccnt; uint32_t npu_idle_ccnt; + uint32_t npu_axi0_read_ccnt; + uint32_t npu_axi0_write_ccnt; + uint32_t npu_axi1_read_ccnt; #endif /* ARM_NPU */ } time_counter; diff --git a/source/application/hal/platforms/native/data_presentation/log/log.c b/source/application/hal/platforms/native/data_presentation/log/log.c index 48e8b95..6ce6684 100644 --- a/source/application/hal/platforms/native/data_presentation/log/log.c +++ b/source/application/hal/platforms/native/data_presentation/log/log.c @@ -30,11 +30,11 @@ int log_display_image(uint8_t* data, const uint32_t width, const uint32_t pos_x, const uint32_t pos_y, const uint32_t downsample_factor) { - info("Image details\n"); - info("Data: %p\n", data); - info("WxHxC: %dx%dx%d\n", width, height, channels); - info("Pos (x,y): (%d,%d)\n", pos_x, pos_y); - info("Downsampling factor: %u\n", downsample_factor); + debug("Image details\n"); + debug("Data: %p\n", data); + debug("WxHxC: %dx%dx%d\n", width, height, channels); + debug("Pos (x,y): (%d,%d)\n", pos_x, pos_y); + debug("Downsampling factor: %u\n", downsample_factor); return 0; } @@ -43,8 +43,8 @@ int log_display_text(const char* str, const size_t str_sz, const bool allow_multiple_lines) { UNUSED(allow_multiple_lines); - info("%s\n", str); - info("Text size: %lu, x: %d, y: %d\n", str_sz, pos_x, pos_y); + debug("%s\n", str); + debug("Text size: %lu, x: %d, y: %d\n", str_sz, pos_x, pos_y); return 0; } @@ -53,19 +53,19 @@ int log_display_box_icon(const uint32_t pos_x, const uint32_t pos_y, const uint32_t width, const uint32_t height, const uint16_t color) { - info("Showing rectangular, width: %d, height: %d, color: %d, x: %d, y: %d\n", + debug("Showing rectangular, width: %d, height: %d, color: %d, x: %d, y: %d\n", width, height, color, pos_x, pos_y); return 0; } int log_clear(const uint16_t color) { - info("Clearing with color: %d\n", color); + debug("Clearing with color: %d\n", color); return 0; } int log_set_text_color (const uint16_t color) { - info("Setting text color: %d\n", color); + debug("Setting text color: %d\n", color); return 0; } diff --git a/source/application/hal/platforms/native/utils/include/dummy_log.h b/source/application/hal/platforms/native/utils/include/dummy_log.h index 626436a..3df5c5c 100644 --- a/source/application/hal/platforms/native/utils/include/dummy_log.h +++ b/source/application/hal/platforms/native/utils/include/dummy_log.h @@ -32,31 +32,31 @@ #define UNUSED(x) ((void)(x)) #if (LOG_LEVEL == LOG_LEVEL_TRACE) - #define trace(...) printf("[TRACE] "); printf(__VA_ARGS__) + #define trace(...) printf("TRACE - "); printf(__VA_ARGS__) #else #define trace(...) #endif /* LOG_LEVEL == LOG_LEVEL_TRACE */ #if (LOG_LEVEL <= LOG_LEVEL_DEBUG) - #define debug(...) printf("[DEBUG] "); printf(__VA_ARGS__) + #define debug(...) printf("DEBUG - "); printf(__VA_ARGS__) #else #define debug(...) #endif /* LOG_LEVEL > LOG_LEVEL_TRACE */ #if (LOG_LEVEL <= LOG_LEVEL_INFO) - #define info(...) printf("[INFO] "); printf(__VA_ARGS__) + #define info(...) printf("INFO - "); printf(__VA_ARGS__) #else #define info(...) #endif /* LOG_LEVEL > LOG_LEVEL_DEBUG */ #if (LOG_LEVEL <= LOG_LEVEL_WARN) - #define warn(...) printf("[WARN] "); printf(__VA_ARGS__) + #define warn(...) printf("WARN - "); printf(__VA_ARGS__) #else #define warn(...) #endif /* LOG_LEVEL > LOG_LEVEL_INFO */ #if (LOG_LEVEL <= LOG_LEVEL_ERROR) - #define printf_err(...) printf("[ERROR] "); printf(__VA_ARGS__) + #define printf_err(...) printf("ERROR - "); printf(__VA_ARGS__) #else #define printf_err(...) #endif /* LOG_LEVEL > LOG_LEVEL_INFO */ diff --git a/source/application/main/Profiler.cc b/source/application/main/Profiler.cc index f364759..ce59d9c 100644 --- a/source/application/main/Profiler.cc +++ b/source/application/main/Profiler.cc @@ -17,25 +17,10 @@ #include "Profiler.hpp" #include <cstring> -#include <string> -#include <sstream> +#include <iomanip> namespace arm { namespace app { - - template<class T> - static void writeStatLine(std::ostringstream& s, - const char* desc, - T total, - uint32_t samples, - T min, - T max) - { - s << "\t" << desc << total << " / " - << ((double)total / samples) << " / " - << min << " / " << max << std::endl; - } - Profiler::Profiler(hal_platform* platform, const char* name = "Unknown") : _m_name(name) { @@ -90,99 +75,162 @@ namespace app { void Profiler::Reset() { this->_m_started = false; + this->_m_series.clear(); memset(&this->_m_tstampSt, 0, sizeof(this->_m_tstampSt)); memset(&this->_m_tstampEnd, 0, sizeof(this->_m_tstampEnd)); } - std::string Profiler::GetResultsAndReset() + void calcProfilingStat(uint64_t currentValue, + Statistics& data, + uint32_t samples) { - std::ostringstream strResults; + data.total += currentValue; + data.min = std::min(data.min, currentValue); + data.max = std::max(data.max, currentValue); + data.avrg = ((double)data.total / samples); + } + void Profiler::GetAllResultsAndReset(std::vector<ProfileResult>& results) + { for (const auto& item: this->_m_series) { auto name = item.first; ProfilingSeries series = item.second; - - uint32_t samplesNum = series.size(); - - uint64_t totalNpuCycles = 0; /* Total NPU cycles (idle + active). */ - uint64_t totalActiveNpuCycles = 0; /* Active NPU cycles. */ - uint64_t totalCpuCycles = 0; /* Total CPU cycles. */ - time_t totalTimeMs = 0; - - uint64_t minActiveNpuCycles = series[0].activeNpuCycles; - uint64_t minIdleNpuCycles = series[0].npuCycles - minActiveNpuCycles; - uint64_t minActiveCpuCycles = series[0].cpuCycles - minActiveNpuCycles; - time_t minTimeMs = series[0].time; - - uint64_t maxIdleNpuCycles = 0; - uint64_t maxActiveNpuCycles = 0; - uint64_t maxActiveCpuCycles = 0; - time_t maxTimeMs = 0; - + ProfileResult result{}; + result.name = item.first; + result.samplesNum = series.size(); + + Statistics AXI0_RD { + .name = "NPU AXI0_RD_DATA_BEAT_RECEIVED", + .unit = "cycles", + .total = 0, + .avrg = 0.0, + .min = series[0].axi0writes, + .max = 0 + }; + Statistics AXI0_WR { + .name = "NPU AXI0_WR_DATA_BEAT_WRITTEN", + .unit = "cycles", + .total = 0, + .avrg = 0.0, + .min = series[0].axi0reads, + .max = 0 + }; + Statistics AXI1_RD { + .name = "NPU AXI1_RD_DATA_BEAT_RECEIVED", + .unit = "cycles", + .total = 0, + .avrg = 0.0, + .min = series[0].axi1reads, + .max = 0 + }; + Statistics NPU_ACTIVE { + .name = "NPU ACTIVE", + .unit = "cycles", + .total = 0, + .avrg = 0.0, + .min = series[0].activeNpuCycles, + .max = 0 + }; + Statistics NPU_IDLE { + .name = "NPU IDLE", + .unit = "cycles", + .total = 0, + .avrg = 0.0, + .min = series[0].idleNpuCycles, + .max = 0 + }; + Statistics NPU_Total { + .name = "NPU total", + .unit = "cycles", + .total = 0, + .avrg = 0.0, + .min = series[0].npuCycles, + .max = 0, + }; +#if defined(CPU_PROFILE_ENABLED) + Statistics CPU_ACTIVE { + .name = "CPU ACTIVE", + .unit = "cycles (approx)", + .total = 0, + .avrg = 0.0, + .min = series[0].cpuCycles - NPU_ACTIVE.min, + .max = 0 + }; + Statistics TIME { + .name = "Time", + .unit = "ms", + .total = 0, + .avrg = 0.0, + .min = static_cast<uint64_t>(series[0].time), + .max = 0 + }; +#endif for(ProfilingUnit& unit: series){ - totalNpuCycles += unit.npuCycles; - totalActiveNpuCycles += unit.activeNpuCycles; - totalCpuCycles += unit.cpuCycles; - totalTimeMs += unit.time; - - maxActiveNpuCycles = std::max(maxActiveNpuCycles, - unit.activeNpuCycles); - maxIdleNpuCycles = std::max(maxIdleNpuCycles, - unit.npuCycles - maxActiveNpuCycles); - maxActiveCpuCycles = std::max(maxActiveCpuCycles, - unit.cpuCycles - maxActiveNpuCycles); - maxTimeMs = std::max(maxTimeMs, unit.time); - - minActiveNpuCycles = std::min(minActiveNpuCycles, - unit.activeNpuCycles); - minIdleNpuCycles = std::min(minIdleNpuCycles, - unit.npuCycles - minActiveNpuCycles); - minActiveCpuCycles = std::min(minActiveCpuCycles, - unit.cpuCycles - minActiveNpuCycles); - minTimeMs = std::min(minTimeMs, unit.time); - } - strResults << "Profile for " << name << ": " << std::endl; + calcProfilingStat(unit.npuCycles, + NPU_Total, result.samplesNum); - if (samplesNum > 1) { - strResults << "\tSamples: " << samplesNum << std::endl; - strResults << "\t Total / Avg./ Min / Max" - << std::endl; + calcProfilingStat(unit.activeNpuCycles, + NPU_ACTIVE, result.samplesNum); - writeStatLine<uint64_t>(strResults, "Active NPU cycles: ", - totalActiveNpuCycles, samplesNum, - minActiveNpuCycles, maxActiveNpuCycles); + calcProfilingStat(unit.idleNpuCycles, + NPU_IDLE, result.samplesNum); - writeStatLine<uint64_t>(strResults, "Idle NPU cycles: ", - (totalNpuCycles - totalActiveNpuCycles), - samplesNum, minIdleNpuCycles, maxIdleNpuCycles); + calcProfilingStat(unit.axi0writes, + AXI0_WR, result.samplesNum); -#if defined(CPU_PROFILE_ENABLED) - writeStatLine<uint64_t>(strResults, "Active CPU cycles (approx): ", - (totalCpuCycles - totalActiveNpuCycles), - samplesNum, minActiveCpuCycles, - maxActiveCpuCycles); + calcProfilingStat(unit.axi0reads, + AXI0_RD, result.samplesNum); - writeStatLine<time_t>(strResults, "Time in ms: ", - totalTimeMs, samplesNum, minTimeMs, maxTimeMs); -#endif - } else { - strResults << "\tActive NPU cycles: " << totalActiveNpuCycles - << std::endl; - strResults << "\tIdle NPU cycles: " - << (totalNpuCycles - totalActiveNpuCycles) - << std::endl; + calcProfilingStat(unit.axi1reads, + AXI1_RD, result.samplesNum); #if defined(CPU_PROFILE_ENABLED) - strResults << "\tActive CPU cycles: " - << (totalCpuCycles - totalActiveNpuCycles) - << " (approx)" << std::endl; + calcProfilingStat(static_cast<uint64_t>(unit.time), + TIME, result.samplesNum); - strResults << "\tTime in ms: " << totalTimeMs << std::endl; + calcProfilingStat(unit.cpuCycles - unit.activeNpuCycles, + CPU_ACTIVE, result.samplesNum); #endif } + result.data.emplace_back(AXI0_RD); + result.data.emplace_back(AXI0_WR); + result.data.emplace_back(AXI1_RD); + result.data.emplace_back(NPU_ACTIVE); + result.data.emplace_back(NPU_IDLE); + result.data.emplace_back(NPU_Total); +#if defined(CPU_PROFILE_ENABLED) + result.data.emplace_back(CPU_ACTIVE); + result.data.emplace_back(TIME); +#endif + results.emplace_back(result); } this->Reset(); - return strResults.str(); + } + + void printStatisticsHeader(uint32_t samplesNum) { + info("Number of samples: %i\n", samplesNum); + info("%s\n", "Total / Avg./ Min / Max"); + } + + void Profiler::PrintProfilingResult(bool printFullStat) { + std::vector<ProfileResult> results{}; + GetAllResultsAndReset(results); + for(ProfileResult& result: results) { + info("Profile for %s:\n", result.name.c_str()); + + if (printFullStat) { + printStatisticsHeader(result.samplesNum); + } + + for (Statistics &stat: result.data) { + if (printFullStat) { + info("%s %s: %llu / %.0f / %llu / %llu \n", stat.name.c_str(), stat.unit.c_str(), + stat.total, stat.avrg, stat.min, stat.max); + } else { + info("%s %s: %.0f\n", stat.name.c_str(), stat.unit.c_str(), stat.avrg); + } + } + } } void Profiler::SetName(const char* str) @@ -197,11 +245,19 @@ namespace app { struct ProfilingUnit unit; - if (timer->cap.npu_cycles && timer->get_npu_total_cycle_diff && - timer->get_npu_active_cycle_diff) + if (timer->cap.npu_cycles && timer->get_npu_cycles_diff) { - unit.npuCycles = timer->get_npu_total_cycle_diff(&start, &end); - unit.activeNpuCycles = timer->get_npu_active_cycle_diff(&start, &end); + const size_t size = 6; + uint64_t pmuCounters[size] = {0}; + /* 6 values: total cc, active cc, idle cc, axi0 read, axi0 write, axi1 read*/ + if (0 == timer->get_npu_cycles_diff(&start, &end, pmuCounters, size)) { + unit.npuCycles = pmuCounters[0]; + unit.activeNpuCycles = pmuCounters[1]; + unit.idleNpuCycles = pmuCounters[2]; + unit.axi0reads = pmuCounters[3]; + unit.axi0writes = pmuCounters[4]; + unit.axi1reads = pmuCounters[5]; + } } if (timer->cap.cpu_cycles && timer->get_cpu_cycle_diff) { diff --git a/source/application/main/UseCaseCommonUtils.cc b/source/application/main/UseCaseCommonUtils.cc index 4ea5e4d..3acf53f 100644 --- a/source/application/main/UseCaseCommonUtils.cc +++ b/source/application/main/UseCaseCommonUtils.cc @@ -21,16 +21,11 @@ namespace arm { namespace app { - bool RunInference(hal_platform& platform, arm::app::Model& model) + bool RunInference(arm::app::Model& model, Profiler& profiler) { - Profiler profiler{&platform, "Inference"}; - profiler.StartProfiling(); - + profiler.StartProfiling("Inference"); bool runInf = model.RunInference(); - profiler.StopProfiling(); - std::string profileResults = profiler.GetResultsAndReset(); - info("%s\n", profileResults.c_str()); return runInf; } diff --git a/source/application/main/include/Profiler.hpp b/source/application/main/include/Profiler.hpp index b16a63b..d93b257 100644 --- a/source/application/main/include/Profiler.hpp +++ b/source/application/main/include/Profiler.hpp @@ -26,10 +26,31 @@ namespace arm { namespace app { + /** Statistics for a profiling metric. */ + struct Statistics { + std::string name; + std::string unit; + std::uint64_t total; + double avrg; + std::uint64_t min; + std::uint64_t max; + }; + + /** Profiling results with calculated statistics. */ + struct ProfileResult { + std::string name; + std::uint32_t samplesNum; + std::vector<Statistics> data; + }; + /** A single profiling unit definition. */ struct ProfilingUnit { uint64_t npuCycles = 0; uint64_t activeNpuCycles = 0; + uint64_t idleNpuCycles = 0; + uint64_t axi0writes = 0; + uint64_t axi0reads = 0; + uint64_t axi1reads = 0; uint64_t cpuCycles = 0; time_t time = 0; }; @@ -73,18 +94,22 @@ namespace app { void Reset(); /** - * @brief Gets the results as string and resets the profiler. - * @returns Result string. + * @brief Collects profiling results statistics and resets the profiler. + **/ + void GetAllResultsAndReset(std::vector<ProfileResult>& results); + + /** + * @brief Prints collected profiling results and resets the profiler. **/ - std::string GetResultsAndReset(); + void PrintProfilingResult(bool printFullStat = false); /** @brief Set the profiler name. */ void SetName(const char* str); private: ProfilingMap _m_series; /* Profiling series map. */ - time_counter _m_tstampSt; /* Container for a current starting timestamp. */ - time_counter _m_tstampEnd; /* Container for a current ending timestamp. */ + time_counter _m_tstampSt{}; /* Container for a current starting timestamp. */ + time_counter _m_tstampEnd{}; /* Container for a current ending timestamp. */ hal_platform * _m_pPlatform = nullptr; /* Platform pointer - to get the timer. */ bool _m_started = false; /* Indicates profiler has been started. */ diff --git a/source/application/main/include/UseCaseCommonUtils.hpp b/source/application/main/include/UseCaseCommonUtils.hpp index 02200e8..7887aea 100644 --- a/source/application/main/include/UseCaseCommonUtils.hpp +++ b/source/application/main/include/UseCaseCommonUtils.hpp @@ -38,11 +38,11 @@ namespace app { * @brief Run inference using given model * object. If profiling is enabled, it will log the * statistics too. - * @param[in] platform Reference to the hal platform object. * @param[in] model Reference to the initialised model. + * @param[in] profiler Reference to the initialised profiler. * @return true if inference succeeds, false otherwise. **/ - bool RunInference(hal_platform& platform, arm::app::Model& model); + bool RunInference(arm::app::Model& mode, Profiler& profiler); /** * @brief Read input and return as an integer. diff --git a/source/use_case/ad/src/MainLoop.cc b/source/use_case/ad/src/MainLoop.cc index 5455b43..6a7cbe0 100644 --- a/source/use_case/ad/src/MainLoop.cc +++ b/source/use_case/ad/src/MainLoop.cc @@ -56,6 +56,8 @@ void main_loop(hal_platform& platform) /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "ad"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("clipIndex", 0); diff --git a/source/use_case/ad/src/UseCaseHandler.cc b/source/use_case/ad/src/UseCaseHandler.cc index c18a0a4..1c15595 100644 --- a/source/use_case/ad/src/UseCaseHandler.cc +++ b/source/use_case/ad/src/UseCaseHandler.cc @@ -76,6 +76,7 @@ namespace app { bool ClassifyVibrationHandler(ApplicationContext& ctx, uint32_t clipIndex, bool runAll) { auto& platform = ctx.Get<hal_platform&>("platform"); + auto& profiler = ctx.Get<Profiler&>("profiler"); constexpr uint32_t dataPsnTxtInfStartX = 20; constexpr uint32_t dataPsnTxtInfStartY = 40; @@ -98,7 +99,7 @@ namespace app { const auto frameLength = ctx.Get<int>("frameLength"); const auto frameStride = ctx.Get<int>("frameStride"); const auto scoreThreshold = ctx.Get<float>("scoreThreshold"); - const float trainingMean = ctx.Get<float>("trainingMean"); + const auto trainingMean = ctx.Get<float>("trainingMean"); auto startClipIdx = ctx.Get<uint32_t>("clipIndex"); TfLiteTensor* outputTensor = model.GetOutputTensor(0); @@ -193,7 +194,7 @@ namespace app { audioDataSlider.TotalStrides() + 1); /* Run inference over this audio clip sliding window */ - arm::app::RunInference(platform, model); + arm::app::RunInference(model, profiler); /* Use the negative softmax score of the corresponding index as the outlier score */ std::vector<float> dequantOutput = Dequantize<int8_t>(outputTensor); @@ -219,6 +220,8 @@ namespace app { return false; } + profiler.PrintProfilingResult(); + _IncrementAppCtxClipIdx(ctx); } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx); diff --git a/source/use_case/asr/src/MainLoop.cc b/source/use_case/asr/src/MainLoop.cc index ca777be..c5a26a4 100644 --- a/source/use_case/asr/src/MainLoop.cc +++ b/source/use_case/asr/src/MainLoop.cc @@ -96,6 +96,8 @@ void main_loop(hal_platform& platform) GetLabelsVector(labels); arm::app::AsrClassifier classifier; /* Classifier wrapper object. */ + arm::app::Profiler profiler{&platform, "asr"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("clipIndex", 0); diff --git a/source/use_case/asr/src/UseCaseHandler.cc b/source/use_case/asr/src/UseCaseHandler.cc index e706eb8..5d3157a 100644 --- a/source/use_case/asr/src/UseCaseHandler.cc +++ b/source/use_case/asr/src/UseCaseHandler.cc @@ -67,6 +67,8 @@ namespace app { auto& platform = ctx.Get<hal_platform&>("platform"); platform.data_psn->clear(COLOR_BLACK); + auto& profiler = ctx.Get<Profiler&>("profiler"); + /* If the request has a valid size, set the audio index. */ if (clipIndex < NUMBER_OF_FILES) { if (!_SetAppCtxClipIdx(ctx, clipIndex)) { @@ -168,18 +170,11 @@ namespace app { info("Inference %zu/%zu\n", audioDataSlider.Index() + 1, static_cast<size_t>(ceilf(audioDataSlider.FractionalTotalStrides() + 1))); - Profiler prepProfiler{&platform, "pre-processing"}; - prepProfiler.StartProfiling(); - /* Calculate MFCCs, deltas and populate the input tensor. */ prep.Invoke(inferenceWindow, inferenceWindowLen, inputTensor); - prepProfiler.StopProfiling(); - std::string prepProfileResults = prepProfiler.GetResultsAndReset(); - info("%s\n", prepProfileResults.c_str()); - /* Run inference over this audio clip sliding window. */ - arm::app::RunInference(platform, model); + arm::app::RunInference(model, profiler); /* Post-process. */ postp.Invoke(outputTensor, reductionAxis, !audioDataSlider.HasNext()); @@ -216,6 +211,8 @@ namespace app { return false; } + profiler.PrintProfilingResult(); + _IncrementAppCtxClipIdx(ctx); } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx); @@ -256,6 +253,8 @@ namespace app { platform.data_psn->set_text_color(COLOR_GREEN); + info("Final results:\n"); + info("Total number of inferences: %zu\n", results.size()); /* Results from multiple inferences should be combined before processing. */ std::vector<arm::app::ClassificationResult> combinedResults; for (auto& result : results) { @@ -268,8 +267,9 @@ namespace app { for (const auto & result : results) { std::string infResultStr = audio::asr::DecodeOutput(result.m_resultVec); - info("Result for inf %u: %s\n", result.m_inferenceNumber, - infResultStr.c_str()); + info("For timestamp: %f (inference #: %u); label: %s\n", + result.m_timeStamp, result.m_inferenceNumber, + infResultStr.c_str()); } /* Get the decoded result for the combined result. */ @@ -280,7 +280,7 @@ namespace app { dataPsnTxtStartX1, dataPsnTxtStartY1, allow_multiple_lines); - info("Final result: %s\n", finalResultStr.c_str()); + info("Complete recognition: %s\n", finalResultStr.c_str()); return true; } diff --git a/source/use_case/img_class/src/MainLoop.cc b/source/use_case/img_class/src/MainLoop.cc index 469907c..66d7064 100644 --- a/source/use_case/img_class/src/MainLoop.cc +++ b/source/use_case/img_class/src/MainLoop.cc @@ -58,6 +58,8 @@ void main_loop(hal_platform& platform) /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "img_class"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("imgIndex", 0); diff --git a/source/use_case/img_class/src/UseCaseHandler.cc b/source/use_case/img_class/src/UseCaseHandler.cc index a412fec..f7e83f5 100644 --- a/source/use_case/img_class/src/UseCaseHandler.cc +++ b/source/use_case/img_class/src/UseCaseHandler.cc @@ -74,6 +74,7 @@ namespace app { bool ClassifyImageHandler(ApplicationContext& ctx, uint32_t imgIndex, bool runAll) { auto& platform = ctx.Get<hal_platform&>("platform"); + auto& profiler = ctx.Get<Profiler&>("profiler"); constexpr uint32_t dataPsnImgDownscaleFactor = 2; constexpr uint32_t dataPsnImgStartX = 10; @@ -144,7 +145,7 @@ namespace app { info("Running inference on image %u => %s\n", ctx.Get<uint32_t>("imgIndex"), get_filename(ctx.Get<uint32_t>("imgIndex"))); - RunInference(platform, model); + RunInference(model, profiler); /* Erase. */ str_inf = std::string(str_inf.size(), ' '); @@ -167,6 +168,8 @@ namespace app { return false; } + profiler.PrintProfilingResult(); + _IncrementAppCtxImageIdx(ctx); } while (runAll && ctx.Get<uint32_t>("imgIndex") != curImIdx); @@ -230,6 +233,8 @@ namespace app { uint32_t rowIdx1 = dataPsnTxtStartY1 + 2 * dataPsnTxtYIncr; uint32_t rowIdx2 = dataPsnTxtStartY2; + info("Final results:\n"); + info("Total number of inferences: 1\n"); for (uint32_t i = 0; i < results.size(); ++i) { std::string resultStr = std::to_string(i + 1) + ") " + diff --git a/source/use_case/inference_runner/src/MainLoop.cc b/source/use_case/inference_runner/src/MainLoop.cc index b110a24..26a20de 100644 --- a/source/use_case/inference_runner/src/MainLoop.cc +++ b/source/use_case/inference_runner/src/MainLoop.cc @@ -38,6 +38,9 @@ void main_loop(hal_platform& platform) /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "inference_runner"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); + caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("imgIndex", 0); diff --git a/source/use_case/inference_runner/src/UseCaseHandler.cc b/source/use_case/inference_runner/src/UseCaseHandler.cc index ac4ea47..a2b3fb7 100644 --- a/source/use_case/inference_runner/src/UseCaseHandler.cc +++ b/source/use_case/inference_runner/src/UseCaseHandler.cc @@ -28,6 +28,7 @@ namespace app { bool RunInferenceHandler(ApplicationContext& ctx) { auto& platform = ctx.Get<hal_platform&>("platform"); + auto& profiler = ctx.Get<Profiler&>("profiler"); auto& model = ctx.Get<Model&>("model"); constexpr uint32_t dataPsnTxtInfStartX = 150; @@ -67,7 +68,7 @@ namespace app { str_inf.c_str(), str_inf.size(), dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0); - RunInference(platform, model); + RunInference(model, profiler); /* Erase. */ str_inf = std::string(str_inf.size(), ' '); @@ -75,6 +76,9 @@ namespace app { str_inf.c_str(), str_inf.size(), dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0); + info("Final results:\n"); + profiler.PrintProfilingResult(); + #if VERIFY_TEST_OUTPUT for (size_t outputIndex = 0; outputIndex < model.GetNumOutputs(); outputIndex++) { arm::app::DumpTensor(model.GetOutputTensor(outputIndex)); diff --git a/source/use_case/kws/src/MainLoop.cc b/source/use_case/kws/src/MainLoop.cc index 24cb939..f971c30 100644 --- a/source/use_case/kws/src/MainLoop.cc +++ b/source/use_case/kws/src/MainLoop.cc @@ -58,6 +58,9 @@ void main_loop(hal_platform& platform) /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "kws"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); + caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("clipIndex", 0); diff --git a/source/use_case/kws/src/UseCaseHandler.cc b/source/use_case/kws/src/UseCaseHandler.cc index 872d323..d2cba55 100644 --- a/source/use_case/kws/src/UseCaseHandler.cc +++ b/source/use_case/kws/src/UseCaseHandler.cc @@ -82,6 +82,7 @@ namespace app { bool ClassifyAudioHandler(ApplicationContext& ctx, uint32_t clipIndex, bool runAll) { auto& platform = ctx.Get<hal_platform&>("platform"); + auto& profiler = ctx.Get<Profiler&>("profiler"); constexpr uint32_t dataPsnTxtInfStartX = 20; constexpr uint32_t dataPsnTxtInfStartY = 40; @@ -215,7 +216,7 @@ namespace app { audioDataSlider.TotalStrides() + 1); /* Run inference over this audio clip sliding window. */ - arm::app::RunInference(platform, model); + arm::app::RunInference(model, profiler); std::vector<ClassificationResult> classificationResult; auto& classifier = ctx.Get<KwsClassifier&>("classifier"); @@ -243,6 +244,8 @@ namespace app { return false; } + profiler.PrintProfilingResult(); + _IncrementAppCtxClipIdx(ctx); } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx); @@ -281,6 +284,8 @@ namespace app { constexpr uint32_t dataPsnTxtYIncr = 16; /* Row index increment. */ platform.data_psn->set_text_color(COLOR_GREEN); + info("Final results:\n"); + info("Total number of inferences: %zu\n", results.size()); /* Display each result */ uint32_t rowIdx1 = dataPsnTxtStartY1 + 2 * dataPsnTxtYIncr; @@ -290,7 +295,7 @@ namespace app { std::string topKeyword{"<none>"}; float score = 0.f; - if (results[i].m_resultVec.size()) { + if (!results[i].m_resultVec.empty()) { topKeyword = results[i].m_resultVec[0].m_label; score = results[i].m_resultVec[0].m_normalisedVal; } @@ -305,13 +310,20 @@ namespace app { dataPsnTxtStartX1, rowIdx1, false); rowIdx1 += dataPsnTxtYIncr; - info("For timestamp: %f (inference #: %u); threshold: %f\n", - results[i].m_timeStamp, results[i].m_inferenceNumber, - results[i].m_threshold); - for (uint32_t j = 0; j < results[i].m_resultVec.size(); ++j) { - info("\t\tlabel @ %u: %s, score: %f\n", j, - results[i].m_resultVec[j].m_label.c_str(), - results[i].m_resultVec[j].m_normalisedVal); + if (results[i].m_resultVec.empty()) { + info("For timestamp: %f (inference #: %u); label: %s; threshold: %f\n", + results[i].m_timeStamp, results[i].m_inferenceNumber, + topKeyword.c_str(), + results[i].m_threshold); + } else { + for (uint32_t j = 0; j < results[i].m_resultVec.size(); ++j) { + info("For timestamp: %f (inference #: %u); label: %s, score: %f; threshold: %f\n", + results[i].m_timeStamp, + results[i].m_inferenceNumber, + results[i].m_resultVec[j].m_label.c_str(), + results[i].m_resultVec[j].m_normalisedVal, + results[i].m_threshold); + } } } diff --git a/source/use_case/kws_asr/src/MainLoop.cc b/source/use_case/kws_asr/src/MainLoop.cc index 37146c9..95e5a8f 100644 --- a/source/use_case/kws_asr/src/MainLoop.cc +++ b/source/use_case/kws_asr/src/MainLoop.cc @@ -101,6 +101,9 @@ void main_loop(hal_platform& platform) /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "kws_asr"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); + caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("kwsmodel", kwsModel); caseContext.Set<arm::app::Model&>("asrmodel", asrModel); diff --git a/source/use_case/kws_asr/src/UseCaseHandler.cc b/source/use_case/kws_asr/src/UseCaseHandler.cc index c50796f..a428210 100644 --- a/source/use_case/kws_asr/src/UseCaseHandler.cc +++ b/source/use_case/kws_asr/src/UseCaseHandler.cc @@ -127,6 +127,7 @@ namespace app { KWSOutput output; + auto& profiler = ctx.Get<Profiler&>("profiler"); auto& kwsModel = ctx.Get<Model&>("kwsmodel"); if (!kwsModel.IsInited()) { printf_err("KWS model has not been initialised\n"); @@ -243,7 +244,7 @@ namespace app { audioDataSlider.TotalStrides() + 1); /* Run inference over this audio clip sliding window. */ - arm::app::RunInference(platform, kwsModel); + arm::app::RunInference(kwsModel, profiler); std::vector<ClassificationResult> kwsClassificationResult; auto& kwsClassifier = ctx.Get<KwsClassifier&>("kwsclassifier"); @@ -284,6 +285,8 @@ namespace app { return output; } + profiler.PrintProfilingResult(); + output.executionSuccess = true; return output; } @@ -300,6 +303,7 @@ namespace app { constexpr uint32_t dataPsnTxtInfStartX = 20; constexpr uint32_t dataPsnTxtInfStartY = 40; + auto& profiler = ctx.Get<Profiler&>("profiler"); auto& platform = ctx.Get<hal_platform&>("platform"); platform.data_psn->clear(COLOR_BLACK); @@ -389,18 +393,11 @@ namespace app { info("Inference %zu/%zu\n", audioDataSlider.Index() + 1, static_cast<size_t>(ceilf(audioDataSlider.FractionalTotalStrides() + 1))); - Profiler prepProfiler{&platform, "pre-processing"}; - prepProfiler.StartProfiling(); - /* Calculate MFCCs, deltas and populate the input tensor. */ asrPrep.Invoke(asrInferenceWindow, asrInferenceWindowLen, asrInputTensor); - prepProfiler.StopProfiling(); - std::string prepProfileResults = prepProfiler.GetResultsAndReset(); - info("%s\n", prepProfileResults.c_str()); - /* Run inference over this audio clip sliding window. */ - arm::app::RunInference(platform, asrModel); + arm::app::RunInference(asrModel, profiler); /* Post-process. */ asrPostp.Invoke(asrOutputTensor, reductionAxis, !audioDataSlider.HasNext()); @@ -432,6 +429,8 @@ namespace app { return false; } + profiler.PrintProfilingResult(); + return true; } diff --git a/tests/common/ProfilerTests.cc b/tests/common/ProfilerTests.cc index caf492b..1435dde 100644 --- a/tests/common/ProfilerTests.cc +++ b/tests/common/ProfilerTests.cc @@ -34,28 +34,73 @@ TEST_CASE("Common: Test Profiler") hal_init(&platform, &data_acq, &data_psn, &timer); hal_platform_init(&platform); - /* An invalid profiler shouldn't be of much use. */ - arm::app::Profiler profilerInvalid {nullptr, "test_invalid"}; - REQUIRE(false == profilerInvalid.StartProfiling()); - REQUIRE(false == profilerInvalid.StopProfiling()); - - arm::app::Profiler profilerValid{&platform, "test_valid"}; - REQUIRE(true == profilerValid.StartProfiling()); - REQUIRE(true == profilerValid.StopProfiling()); - - std::string strProfile = profilerValid.GetResultsAndReset(); - REQUIRE(std::string::npos != strProfile.find("test_valid")); - -#if defined(CPU_PROFILE_ENABLED) - /* We should have milliseconds elapsed. */ - REQUIRE(std::string::npos != strProfile.find("ms")); -#endif /* defined(CPU_PROFILE_ENABLED) */ - - /* Abuse should fail: */ - REQUIRE(false == profilerValid.StopProfiling()); /* We need to start it first. */ - REQUIRE(true == profilerValid.StartProfiling()); /* Should be able to start it fine. */ - REQUIRE(false == profilerValid.StartProfiling()); /* Can't restart it without resetting. */ - profilerValid.Reset(); /* Reset. */ - REQUIRE(true == profilerValid.StartProfiling()); /* Can start it again now. */ - REQUIRE(true == profilerValid.StopProfiling()); /* Can start it again now. */ -} + /* An invalid profiler shouldn't be of much use */ + SECTION("Test invalid profiler") { + arm::app::Profiler profilerInvalid{nullptr, "test_invalid"}; + REQUIRE(false == profilerInvalid.StartProfiling()); + REQUIRE(false == profilerInvalid.StopProfiling()); + } + + SECTION("Test valid profiler") { + arm::app::Profiler profilerValid{&platform, "test_valid"}; + REQUIRE(true == profilerValid.StartProfiling()); + REQUIRE(true == profilerValid.StopProfiling()); + std::vector<arm::app::ProfileResult> results; + profilerValid.GetAllResultsAndReset(results); + REQUIRE(results.size() == 1); + REQUIRE(results[0].name == "test_valid"); + /* Abuse should still fail: */ + REQUIRE(false == profilerValid.StopProfiling()); /* We need to start it first */ + REQUIRE(true == profilerValid.StartProfiling()); /* Should be able to start it fine */ + REQUIRE(false == profilerValid.StartProfiling()); /* Can't restart it without resetting */ + profilerValid.Reset(); + REQUIRE(true == profilerValid.StartProfiling()); /* Can start it again now.. */ + REQUIRE(true == profilerValid.StopProfiling()); + } + + SECTION("Test multiple profilers") { + arm::app::Profiler profilerValid{&platform, "one"}; + REQUIRE(true == profilerValid.StartProfiling()); + REQUIRE(true == profilerValid.StopProfiling()); + + REQUIRE(true == profilerValid.StartProfiling("two")); + REQUIRE(true == profilerValid.StopProfiling()); + REQUIRE(true == profilerValid.StartProfiling("two")); + REQUIRE(true == profilerValid.StopProfiling()); + + std::vector<arm::app::ProfileResult> results; + profilerValid.GetAllResultsAndReset(results); + REQUIRE(results.size() == 2); + REQUIRE(results[0].name == "one"); + REQUIRE(results[0].samplesNum == 1); + REQUIRE(results[1].name == "two"); + REQUIRE(results[1].samplesNum == 2); + } + +#if defined (CPU_PROFILE_ENABLED) + SECTION("Test CPU profiler") { + + arm::app::Profiler profilerCPU{&platform, "test cpu"}; + std::vector<arm::app::ProfileResult> results; + profilerCPU.StartProfiling(); + profilerCPU.StopProfiling(); + profilerCPU.GetAllResultsAndReset(results); + REQUIRE(results.size() == 1); + bool foundTime = false; + bool foundCPU_ACTIVE = false; + for(arm::app::Statistics& stat: results[0].data) { + + if (!foundTime) { + foundTime = stat.name == "Time"; + } + + if (!foundCPU_ACTIVE) { + foundCPU_ACTIVE = stat.name == "CPU ACTIVE"; + } + + } + REQUIRE(foundTime); + REQUIRE(foundCPU_ACTIVE); + } +#endif /* defined (CPU_PROFILE_ENABLED) */ +}
\ No newline at end of file diff --git a/tests/use_case/img_class/ImgClassificationUCTest.cc b/tests/use_case/img_class/ImgClassificationUCTest.cc index abfcc44..b989415 100644 --- a/tests/use_case/img_class/ImgClassificationUCTest.cc +++ b/tests/use_case/img_class/ImgClassificationUCTest.cc @@ -53,7 +53,7 @@ TEST_CASE("Inference by index", "[.]") hal_platform_init(&platform); /* Model wrapper object. */ - arm::app::MobileNetModel model; + arm::app::MobileNetModel model; /* Load the model. */ REQUIRE(model.Init()); @@ -61,6 +61,8 @@ TEST_CASE("Inference by index", "[.]") /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "img_class"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("imgIndex", 0); @@ -99,6 +101,8 @@ TEST_CASE("Inference run all images", "[.]") /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "img_class"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("imgIndex", 0); diff --git a/tests/use_case/kws/KWSHandlerTest.cc b/tests/use_case/kws/KWSHandlerTest.cc index dee2f6f..50e5a83 100644 --- a/tests/use_case/kws/KWSHandlerTest.cc +++ b/tests/use_case/kws/KWSHandlerTest.cc @@ -60,6 +60,9 @@ TEST_CASE("Inference by index") /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + + arm::app::Profiler profiler{&platform, "kws"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<int>("frameLength", g_FrameLength); /* 640 sample length for DSCNN. */ @@ -137,6 +140,8 @@ TEST_CASE("Inference run all clips") /* Instantiate application context. */ arm::app::ApplicationContext caseContext; + arm::app::Profiler profiler{&platform, "kws"}; + caseContext.Set<arm::app::Profiler&>("profiler", profiler); caseContext.Set<hal_platform&>("platform", platform); caseContext.Set<arm::app::Model&>("model", model); caseContext.Set<uint32_t>("clipIndex", 0); |