1 /* Copyright 2024 Arm Limited and/or its affiliates.
2 *
3 * This source code is licensed under the BSD-style license found in the
4 * LICENSE file in the root directory of this source tree.
5 */
6
7 #include <cinttypes>
8 #include <vector>
9
10 #include "arm_perf_monitor.h"
11
12 #ifdef ETHOSU
13 #include <ethosu_driver.h>
14 #include <executorch/runtime/platform/log.h>
15 #include <pmu_ethosu.h>
16
17 static uint32_t ethosu_inference_count = 0;
18 static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0;
19 static uint64_t ethosu_ArmBackendExecuteCycleCount = 0;
20 static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0;
21 static uint64_t ethosu_ArmWhenNPURunCycleCount = 0;
22 static uint64_t ethosu_pmuCycleCount = 0;
23 static std::vector<uint64_t> ethosu_pmuEventCounts(
24 ETHOSU_PMU_Get_NumEventCounters(),
25 0);
26
27 #if defined(ETHOSU55) || defined(ETHOSU65)
28 static const uint32_t ethosu_pmuCountersUsed = 4;
29 #elif defined(ETHOSU85)
30 static const uint32_t ethosu_pmuCountersUsed = 5;
31 #else
32 #error No NPU target defined
33 #endif
34
35 // ethosu_pmuCountersUsed should match numbers of counters setup in
36 // ethosu_inference_begin() and not be more then the HW supports
37 static_assert(ETHOSU_PMU_NCOUNTERS >= ethosu_pmuCountersUsed);
38
39 extern "C" {
40
41 // Callback invoked at start of NPU execution
ethosu_inference_begin(struct ethosu_driver * drv,void *)42 void ethosu_inference_begin(struct ethosu_driver* drv, void*) {
43 // Enable PMU
44 ETHOSU_PMU_Enable(drv);
45 ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE);
46 ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE);
47
48 // Setup 4 counters
49 #if defined(ETHOSU55) || defined(ETHOSU65)
50 ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED);
51 ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED);
52 ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN);
53 ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE);
54 // Enable the 4 counters
55 ETHOSU_PMU_CNTR_Enable(
56 drv,
57 ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CNT2_Msk | ETHOSU_PMU_CNT3_Msk |
58 ETHOSU_PMU_CNT4_Msk);
59 #elif defined(ETHOSU85)
60 ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_SRAM_RD_DATA_BEAT_RECEIVED);
61 ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_SRAM_WR_DATA_BEAT_WRITTEN);
62 ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_EXT_RD_DATA_BEAT_RECEIVED);
63 ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_EXT_WR_DATA_BEAT_WRITTEN);
64 ETHOSU_PMU_Set_EVTYPER(drv, 4, ETHOSU_PMU_NPU_IDLE);
65 // Enable the 5 counters
66 ETHOSU_PMU_CNTR_Enable(
67 drv,
68 ETHOSU_PMU_CNT1_Msk | ETHOSU_PMU_CNT2_Msk | ETHOSU_PMU_CNT3_Msk |
69 ETHOSU_PMU_CNT4_Msk | ETHOSU_PMU_CNT5_Msk);
70 #else
71 #error No NPU target defined
72 #endif
73
74 ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk);
75 ETHOSU_PMU_CYCCNT_Reset(drv);
76
77 // Reset all counters
78 ETHOSU_PMU_EVCNTR_ALL_Reset(drv);
79
80 // Save Cortex-M cycle clock to calculate total CPU cycles used in
81 // ethosu_inference_end()
82 ethosu_ArmWhenNPURunCycleCountStart = ARM_PMU_Get_CCNTR();
83 }
84
85 // Callback invoked at end of NPU execution
ethosu_inference_end(struct ethosu_driver * drv,void *)86 void ethosu_inference_end(struct ethosu_driver* drv, void*) {
87 ethosu_inference_count++;
88 ethosu_pmuCycleCount += ETHOSU_PMU_Get_CCNTR(drv);
89
90 for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
91 ethosu_pmuEventCounts[i] += ETHOSU_PMU_Get_EVCNTR(drv, i);
92 }
93 ETHOSU_PMU_Disable(drv);
94 // Add Cortex-M cycle clock used during this NPU execution
95 ethosu_ArmWhenNPURunCycleCount +=
96 (ARM_PMU_Get_CCNTR() - ethosu_ArmWhenNPURunCycleCountStart);
97 }
98
99 // Callback invoked at start of ArmBackend::execute()
ArmBackend_execute_begin()100 void ArmBackend_execute_begin() {
101 // Save Cortex-M cycle clock to calculate total CPU cycles used in
102 // ArmBackend_execute_end()
103 ethosu_ArmBackendExecuteCycleCountStart = ARM_PMU_Get_CCNTR();
104 }
105
106 // Callback invoked at end of ArmBackend::execute()
ArmBackend_execute_end()107 void ArmBackend_execute_end() {
108 // Add Cortex-M cycle clock used during this ArmBackend::execute()
109 ethosu_ArmBackendExecuteCycleCount +=
110 (ARM_PMU_Get_CCNTR() - ethosu_ArmBackendExecuteCycleCountStart);
111 }
112 }
113
StartMeasurements()114 void StartMeasurements() {
115 ethosu_ArmBackendExecuteCycleCount = 0;
116 ethosu_ArmWhenNPURunCycleCount = 0;
117 ethosu_pmuCycleCount = 0;
118
119 for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
120 ethosu_pmuEventCounts[i] = 0;
121 }
122 ARM_PMU_Enable();
123 DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable
124 ARM_PMU_CYCCNT_Reset();
125 ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
126 }
127
StopMeasurements()128 void StopMeasurements() {
129 ARM_PMU_CNTR_Disable(
130 PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk |
131 PMU_CNTENCLR_CNT1_ENABLE_Msk);
132 uint32_t cycle_count = ARM_PMU_Get_CCNTR();
133
134 // Number of comand streams handled by the NPU
135 ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count);
136 ET_LOG(Info, "Profiler report, CPU cycles per operator:");
137 // This is number of CPU cycles for the ethos-u operator from start to finish
138 // in the framework If there is more then one commandstream the time is added
139 // together
140 ET_LOG(
141 Info,
142 "ethos-u : cycle_cnt : %d cycles",
143 ethosu_ArmBackendExecuteCycleCount);
144 // We could print a list of the cycles used by the other delegates here in the
145 // future but now we only print ethos-u: this means that "Operator(s) total:
146 // ..." will be the same number as ethos-u : cycle_cnt and not the sum of all
147 ET_LOG(
148 Info,
149 "Operator(s) total: %d CPU cycles",
150 ethosu_ArmBackendExecuteCycleCount);
151 // Total CPU cycles used in the executorch method->execute()
152 // Other delegates and no delegates are counted in this
153 ET_LOG(Info, "Inference runtime: %d CPU cycles total", cycle_count);
154
155 ET_LOG(
156 Info,
157 "NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency");
158
159 // Avoid division with zero if ARM_PMU_Get_CCNTR() is not enabled properly.
160 if (cycle_count == 0) {
161 ET_LOG(Info, "Inference CPU ratio: ?.?? %%");
162 ET_LOG(Info, "Inference NPU ratio: ?.?? %%");
163 } else {
164 ET_LOG(
165 Info,
166 "Inference CPU ratio: %.2f %%",
167 100.0 * (cycle_count - ethosu_ArmWhenNPURunCycleCount) / cycle_count);
168 ET_LOG(
169 Info,
170 "Inference NPU ratio: %.2f %%",
171 100.0 * ethosu_ArmWhenNPURunCycleCount / cycle_count);
172 }
173
174 // CPU cycles used by NPU, e.g. number of CPU cycles used between
175 // ethosu_inference_begin() and ethosu_inference_end()
176 // If there is more then one commandstream the time is added together
177 ET_LOG(
178 Info,
179 "cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles",
180 ethosu_ArmWhenNPURunCycleCount);
181
182 ET_LOG(Info, "Ethos-U PMU report:");
183 ET_LOG(Info, "ethosu_pmu_cycle_cntr : %" PRIu64, ethosu_pmuCycleCount);
184
185 for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) {
186 ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]);
187 }
188 #if defined(ETHOSU55) || defined(ETHOSU65)
189 ET_LOG(
190 Info,
191 "Ethos-U PMU Events:[ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]");
192 #elif defined(ETHOSU85)
193 ET_LOG(
194 Info,
195 "Ethos-U PMU Events:[ETHOSU_PMU_SRAM_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_SRAM_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_EXT_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]");
196 #else
197 #error No NPU target defined
198 #endif
199 }
200
201 #else
// Builds without ETHOSU have nothing to measure: intentionally a no-op.
void StartMeasurements() {
  return;
}
203
// Builds without ETHOSU have nothing to report: intentionally a no-op.
void StopMeasurements() {
  return;
}
205
206 #endif
207