1Contract between NNAPI systrace code and parser 2=============================================== 3 4This text files documents how tracing in the NNAPI manifests in systrace output 5and how that output is interpreted by the systrace parser. 6 7Please view in a 160 character window. 8 9Special cases 10------------- 11 12- Execution is exposed as an asynchronous event from the runtime. Time taken 13 by the runtime is calculated as the time between start of 14 ANeuralNetworksExecution_startCompute and end of 15 ANeuralNetworksEvent_wait. This special case is not reflected in the cases 16 presented below. 17 18Notation 19-------- 20- ...: elided code 21- t_m_w: tracing_mark_write 22- tX: timestamps 23- T1, T2: thread ids 24 25 26Cases for the parser 27==================== 28 29Source code Systrace Interpretation for timing statistics - all 30 times are wallclock 31------------------------------------------------------------------------------------------------------------------------------------------------------ 32 33*Baseline case* 34 35... funcP(...) { t0: t_m_w:B|T1|[NN_LR_PP]funcP Add (t1-t0) to total time spent in Layer 36 NNTRACE_RT(NNTRACE_PHASE_PREPARATION, t1: t_m_w:E|T1 Runtime, Phase Preparation 37 "funcP); 38 ... 39} 40 41------------------------------------------------------------------------------------------------------------------------------------------------------ 42 43*Local call to other layer* 44 45... funcA1(...) { t0: t_m_w:B|T1|[NN_LA_PP]funcE1 Add (t3-t0) to total time spent in Layer 46 NNTRACE_APP(NNTRACE_PHASE_PREPARATION, t1: t_m_w:B|T1|[NN_LR_PP]funcC1 Application, Phase Preparation 47 "funcA1); t2: t_m_w:E|T1 48 ... funcR1(...); t3: t_m_w:E|T1 Add (t2-t1) to total time spent in Layer 49 ... Runtime, Phase Preparation 50} 51... funcR1(...) { Note: Self-time of Layer Application, 52 NNTRACE_RT(NNTRACE_PHASE_PREPARATION, Phase Preparation will be calculated as 53 "funcR1"); ... total time in Layer Application - total time 54} in Layer Runtime 55 56 Note: These can be nested as per rows 57 below ("Switch phase ...", "Subphases ...", 58 "Additional detail...") 59 60 Note: If the called function specifies a 61 phase that is not supposed to be nested, 62 the parser will emit a diagnostic. 63 64------------------------------------------------------------------------------------------------------------------------------------------------------ 65 66*Switch phase during function* 67 68... funcC1(...) { t0: t_m_w:B|T1|[NN_LC_PTR]funcC1 Add (t1-t0) to total time spent in Layer 69 NNTRACE_TRANS("funcC1"); t1: t_m_w:B|T1|[SW][NN_LC_PCO]funcC1 CPU, Phase Transformation 70 ... t2: t_m_w:E|T1 71 NNTRACE_COMP_SWITCH("funcC1"); t3: t_m_w:E|T1 Add (t2-t1) to total time spent in Layer 72 ... CPU, Phase Computation 73} 74 (t3-t2 treated as negligible - only the 75 destructors of objects created between 76 the tracepoints) 77 78------------------------------------------------------------------------------------------------------------------------------------------------------ 79 80*Subphases of execution* 81 82... funcR2(...) { t0: t_m_w:B|T1|[NN_LR_PE]funcR2 Add (t2-t1) to total time spent in Layer 83 NNTRACE_RT(NNTRACE_PHASE_EXECUTION, t1: t_m_w:B|T1|[NN_LC_PCO]funcC2 CPU, (sub)Phase Computation and to 84 "funcR2); t2: t_m_w:E|T1 total time in Phase Execution 85 ... funcC2(...); t3: t_m_w:E|T1 86 ... Add (t3-t0) to total time spent in Layer 87} Runtime, Phase Execution 88... funcC2(...) { 89 NNTRACE_COMP("funcC2"); 90 ... 91} 92 93------------------------------------------------------------------------------------------------------------------------------------------------------ 94 95*Additional detail in the same layer* 96 97... funcR3(...) { t0: t_m_w:B|T1|[NN_LR_PE]funcR3 Add (t3-t0) to total time spent in Layer 98 NNTRACE_RT(NNTRACE_PHASE_EXECUTION, t1: t_m_w:B|T1|[NN_LR_PE]funcR4 Runtime, Phase Execution 99 "funcR3); t2: t_m_w:E|T1 100 ... funcR4(...); t3: t_m_w:E|T1 Note: funcR4 will be visible in the systrace 101 ... visualization 102} 103... funcR4(...) { 104 NNTRACE_RT(NNTRACE_PHASE_EXECUTION, 105 "funcR4"); 106 ... 107} 108 109 110------------------------------------------------------------------------------------------------------------------------------------------------------ 111 112*Synchronous IPC call* 113 114... funcR5(...) { t0: t_m_w:B|T1|[NN_LR_PC]funcR5 Add (t5-t0) - (t4-r1) to total time spent in 115 NNTRACE_RT(NNTRACE_PHASE_COMPILATION, t1: t_m_w:B|T1|[NN_LI_PI]getCapabilities Layer Runtime, Phase Compilation; see 116 "funcR5"); t2: t_m_w:B|T1|HIDL::IDevice::getCapabilities::client "Onetime initialization code". 117 ... device->getCapabilities() t3: t_m_w:E|T1 118 ... t4: t_m_w:E|T1 Add (t4-t1) to total time spent in Layer 119} t5: t_m_w:E|T1 IPC, Phase Initialization 120... VersionedIDevice::getCapabilities(...) { 121 NTRACE_FULL(NNTRACE_LAYER_IPC, Note: Self-time of Layer Runtime, Phase 122 NNTRACE_PHASE_COMPILATION, Compilation will be calculated as total 123 "getCapabilities"); time in Layer Runtime - total time in Layer 124 IPC 125} 126 Note: Tracepoints are needed for the 127 client IPC calls. The HIDL tracing isn't 128 guaranteed to wait for the server - it just 129 sends the transaction even if the call is 130 synchronous. 131 132------------------------------------------------------------------------------------------------------------------------------------------------------ 133 134*Asynchronous IPC call that is synchronously waited for by the runtime* 135 136// Runtime code t0: t_m_w:B|T1|[NN_LI_PC]prepareModel Add (t10-t0) to total time spent in Layer 137... funcRC(...) { t1: t_m_w:B|T1|HIDL::IDevice::prepareModel_1_1::client IPC, Phase Compilation 138 ... t2: t_m_w:B|T2|HIDL::IDevice::prepareModel_1_1::server 139 NTRACE_FULL(NNTRACE_LAYER_IPC, t3: t_m_w:B|T2|[NN_LD_PC]SampleDriver::prepareModel Add (t6-t2) to total time spent in Layer 140 NNTRACE_PHASE_COMPILATION, t4: t_m_w:B|T2|HIDL::IPreparedModelCallback::notify::clie Driver, Phase Compilation. This includes 141 "prapareModel"); t5: t_m_w:E|T2 the generated HIDL stub code, which is 142 ... t6: t_m_w:E|T2 <0.05ms. 143 device->prepareModel(...); t7: t_m_w:E|T2 144 ... t8: t_m_w:B|T1|HIDL::IPreparedModelCallback::notify::serv Note: the HIDL trace rows are added by 145 cb->wait(); t9: t_m_w:E|T1 the automatically generated proxy and 146 ... t10: t_m_w:E|T1 stub code. For the driver side, the 147} t11: t_m_w:E|T1 mapping of the HIDL functions to layers 148 and phases is done in the systrace 149// Driver code parser. 150... SampleDriver::prepareModel(...) { 151 NNTRACE_FULL(NNTRACE_LAYER_DRIVER, Note: the SampleDriver::prepareModel is 152 NNTRACE_PHASE_COMPILATION, treated as additional detail for Layer 153 "SampleDriver::prepareModel"); Driver, Phase Compilation. 154} 155 Note: the "t_m_w" output of 156 systrace uses thread ids, so that the call 157 stack can be reconstructed. The systrace 158 rows are also annotated with process ids. 159 The parser uses the process ids to 160 distinguish between the application 161 process from the driver process (used for 162 diagnostics and for distinguishing CPU 163 fallback from sample driver). 164 165 Note: the next row in this table gives more 166 detail for prepareModel specifically 167 168 Note: the driver-side HIDL traces get us 169 the time spent in sample and hvx drivers. 170 With a different driver threading model 171 this may not be the case - future drivers 172 should add manual tracing. 173 174 TODO: attribute driver process IPC call 175 (callback) overhead to IPC layer. 176 177------------------------------------------------------------------------------------------------------------------------------------------------------ 178 179*Subtracting time when nesting is violated* 180 181// Runtime code t0: t_m_w:B|T1|[NN_LI_PC]prepareModel Add (t3 - t0) - (t2 - t1) to total time spent 182... funcRC(...) { t1: t_m_w:B|T1|[SUB][NN_LR_PC]VersionedIDevice::prepareM in Layer IPC, Phase compilation 183 ... t2: t_m_w:E|T1 184 NTRACE_FULL(NNTRACE_LAYER_IPC, t3: t_m_w:E|T1 Add (t2-t1) to total time spent in Layer 185 NNTRACE_PHASE_COMPILATION, Runtime, Phase compilation 186 "prapareModel"); 187 ... 188 device->prepareModel(...); 189 ... 190 cb->wait(); 191 ... 192} 193 194... VersionedIDevice::prepareModel(...) { 195 // IPC work 196 { 197 NNTRACE_FULL_SUBTRACT( 198 NNTRACE_LAYER_RUNTIME, 199 NNTRACE_PHASE_COMPILATION, 200 "VersionedIDevice::prepareModel"); 201 // Runtime work 202 } 203 // IPC work 204} 205 206------------------------------------------------------------------------------------------------------------------------------------------------------ 207 208*Onetime initialization code* 209 210... funcR5(...) { t0: t_m_w:B|T1|[NN_LR_PP]funcR5 Add (t2-t1) to total time spent in Layer 211 NNTRACE_RT(NNTRACE_PHASE_PREPARATION, t1: t_m_w:B|T1|[NN_LR_PI]funcI Runtime, Phase Initialization 212 "runcR5); t2: t_m_w:E|T1 213 ... funcI(...); t3: t_m_w:E|T1 Add (t3 - t0) - (t2 - t1) to total time spent 214 ... in Layer Runtime, Phase Preparation. 215} 216... funcI(...) { 217 NNTRACE_RT(NNTRACE_PHASE_INITIALIZATION, 218 "funcI") 219 ... 220} 221 222------------------------------------------------------------------------------------------------------------------------------------------------------ 223 224*Utility code* 225 226... funcR6(...) { t0: t_m_w:B|T1|[NN_LR_PP]funcR6 Add (t3-t0) to total time spent in Layer 227 NNTRACE_RT(NNTRACE_PHASE_PREPARATION, t1: t_m_w:B|T1|[NN_LU_PU]funcU Runtime, Phase Preparation 228 "funcR6"); t2: t_m_w:E|T1 229 ... funcU(...); t3: t_m_w:E|T1 Note: the funcU is treated as additional 230 ... detail. 231} 232... funcU(...) { 233 NNTRACE_FULL(NNTRACE_LAYER_UTILITY, 234 NNTRACE_PHASE_UNSPECIFIED, 235 "funcU") 236 ... 237} 238