""" NNAPI Systrace parser - tracking of call tree based on trace lines

    See contract-between-code-and-parser.txt for the
    specification (cases in the specification are referred to with SPEC).
"""

import re
from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag,
                           get_function_name_from_mark, make_tag)
from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION
from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT
from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK
from parser.tree import SingleThreadCallTree

class AppPhase(object):
  """ Class to track the overall phase of the program. Used to split up warmup and benchmark.

      Needs to be separate from the call trees to propagate the difference to driver.
  """
  def __init__(self):
    self.reset()

  def current(self):
    """ Return the innermost active phase, or PHASE_OVERALL when no phase
        has been pushed (i.e. outside any warmup/benchmark section).
    """
    if self.stack:
      return self.stack[-1]
    else:
      return PHASE_OVERALL

  def push(self, phase):
    """ Enter a nested application phase (e.g. PHASE_WARMUP or PHASE_BENCHMARK). """
    self.stack.append(phase)

  def pop(self):
    """ Leave the innermost application phase. """
    self.stack.pop()

  def reset(self):
    """ Clear all phase state; current() reverts to PHASE_OVERALL. """
    self.stack = []

class Tracker(object):
  """ Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree
      as well as keeping track of entry and exit times for functions.

      Exposes statistics for a single thread, transforming the call tree as needed.
      All statistics are in milliseconds.

      Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat().

      Subphases of Execution are aggregated towards the overall Execution phase as needed.
  """
  def __init__(self, tgid, is_driver, app_phase):
    # tgid: thread group id of the process this thread belongs to.
    # is_driver: True when the process is the NN driver; controls whether
    #   marks are attributed to LAYER_DRIVER or LAYER_CPU in handle_mark().
    # app_phase: shared AppPhase instance (shared so that the warmup/benchmark
    #   distinction propagates from the application process to the driver).
    self.tgid = tgid
    self.is_driver = is_driver
    self.app_phase = app_phase

    # Match the trace string
    # "[NN_LA_PP]funcE1" in "B|<thread1>|[NN_LA_PP]funcE1"
    # "[NN_LC_PCO]funcC1" in "B|<thread1>|[SW][NN_LC_PCO]funcC1"
    # group(1) is the NN tag (e.g. "NN_LA_PP"); the greedy ".*" skips any
    # leading bracketed markers such as "[SW]" so the *last* bracketed item
    # before the function name is captured.
    self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?")

    self.reset()

  def reset(self):
    """ Clear all accumulated state so the tracker can be reused for a new run. """
    self.stats = {}                 # tag -> total elapsed ms (filled by calculate_stats())
    self.items = {}                 # tag -> list of human-readable per-call strings
    self.mytree = SingleThreadCallTree()
    self.begins_and_ends_ms = {}    # "appphase::function" -> [[begin_ms, end_ms], ...]
    self.la_pe_counts = {}          # app phase -> count of LAYER_APPLICATION executions
    self.debugstring = "\n"

  def handle_mark(self, time, mark):
    """ Handle a single trace item (scoped entry and exit).
        Translates:
        - Automatically generated HIDL traces into NNTRACE layers and phases
        - SPEC:Switch phase during function into dummy items
        - SPEC:Subtracting time when nesting is violated into "subtract"
          markers
        - CPU/Driver layer distinction based on whether the process is the
          driver or an application
        This function is called multiple times for a single application run,
        afterwards the statistics can be calculated.

        time: timestamp of the trace line (seconds; stored on the tree nodes
          and converted to ms when the node is popped).
        mark: the trace text, beginning with "B" (begin) or "E" (end).
    """
    if mark[0] == "B":
      switch = False
      subtract = False
      # Workarounds for wrong tracepoints in early versions
      # TODO(mikie): remove later
      if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark):
        mark = mark.replace("_PT", "_PE")
      elif ("[SW][NN_LA_PR]executeWithCompilation" in mark):
        mark = mark.replace("[SW]", "")
      if MARKER_SWITCH in mark:
        switch = True
      if MARKER_SUBTRACT in mark:
        subtract = True
      if switch:
        # End previous item
        self.handle_mark(time, "E")
        # Push a dummy item that will get popped by the 'real' end of the
        # previous item.
        self.mytree.push_dummy(time)
      m = self.matcher.search(mark)
      if m is None:
        # Not an explicit NNTRACE mark; try to map an auto-generated HIDL
        # trace onto an NN layer/phase tag.
        tag = translate_hidl_mark_to_nn_and_tag(mark)
        if tag is None:
          raise Exception("Couldn't parse mark " + mark)
      else:
        tag = m.group(1)
      # Tag format is "NN_<layer>_<phase>", e.g. "NN_LA_PP".
      [_, layer, phase] = tag.split("_")
      if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
        self.app_phase.push(phase)
      # The same tracepoints are emitted whether the code runs in the driver
      # process or inside the application; reattribute the layer accordingly.
      if not self.is_driver:
        layer = layer.replace(LAYER_DRIVER, LAYER_CPU)
      else:
        layer = layer.replace(LAYER_CPU, LAYER_DRIVER)
      if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION:
        # Count executions per app phase; used by get_execution_count() as a
        # fallback when ANeuralNetworksExecution_create is not traced (pre-P).
        self.la_pe_counts[self.app_phase.current()] = (
            self.la_pe_counts.get(self.app_phase.current(), 0) + 1)
      self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract)
    elif mark[0] == "E":
      node = self.mytree.pop(time)
      if node.is_dummy():  # Dummy item
        pass
      else:
        if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
          self.app_phase.pop()
        function = node.app_phase + "::" + get_function_name_from_mark(node.mark)
        # Record begin/end in milliseconds (node times are in seconds).
        self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) +
                                             [[float(node.start_time_s) * 1000.0,
                                               float(node.end_time_s) * 1000.0]])

  def is_complete(self):
    """ Checks if we've seen all end tracepoints for the begin tracepoints.
    """
    return self.mytree.current.is_root()

  def calculate_stats(self):
    """ Aggregate the finished call tree into self.stats and self.items.

        Must only be called once all begins have been matched by ends
        (see is_complete()). The tree normalization passes below are
        order-dependent; do not reorder them.
    """
    assert self.is_complete()
    self.mytree.remove_ignored()
    self.mytree.remove_dummies()
    self.mytree.copy_subtracted_init_and_wrong_la()
    self.mytree.add_missing_la_nodes()
    # self.mytree.print()
    self.mytree.validate_nesting()

    def recurse(node, prev_layer, prev_phase, indent, in_pe_layers):
      # Walk the tree depth-first, summing elapsed time per
      # "<appphase>_<layer>_<phase>" tag. in_pe_layers tracks, per layer, how
      # deep we are inside PHASE_EXECUTION so that execution subphases are
      # only aggregated into the enclosing layer's PE total once.
      # NOTE(review): "begun" and "indent" appear to be debugging leftovers;
      # they are computed/threaded through but not otherwise used here.
      [begun, mark, layer, phase] = [
          node.start_time_s, node.mark, node.layer, node.phase()]
      time = node.end_time_s
      tag = None
      # "DETAIL" marks entries excluded from totals (added-detail/subtract
      # nodes); replaced by the real elapsed time for countable nodes below.
      elapsed0 = "DETAIL"
      elapsed1 = node.elapsed_less_subtracted_ms()
      if elapsed1 is None:
        raise Exception("Elapsed for {} returned None".format(node.to_str()))

      if not node.is_added_detail() and not node.subtract:
        tag = node.app_phase + "_" + layer + "_" + phase
        elapsed0 = elapsed1
        self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0
        self.items[tag] = self.items.get(tag, []) + [
            mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + tag]
        if phase in subphases[PHASE_EXECUTION]:
          # Execution subphase: also roll it up into the layer's overall
          # Execution phase, unless an enclosing node in this layer is
          # already inside PHASE_EXECUTION (which would double-count).
          if not in_pe_layers.get(layer):
            pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION)
            self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0
            self.items[pe_tag] = self.items.get(pe_tag, []) + [
                mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + pe_tag]
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1
      for c in node.children:
        recurse(c, layer or prev_layer, phase or prev_phase,
                indent + " ", in_pe_layers)
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers[layer] - 1
      return

    for top in self.mytree.root.children:
      recurse(top, None, None, "", {})
    self.debugstring = self.mytree.to_str()

  # We need to special case the driver execution time because:
  # - The existing drivers don't have tracing, so we rely on HIDL traces
  # - Best we can do is to take the start of the HIDL server side call as
  #   the starting point (which includes a bit of overhead, but not much) and
  #   the start of the callback as the end point (which should be pretty
  #   accurate)
  # Note that the begin and end may be on different threads, hence the
  # calculation needs to happen in aggregation rather than here.
  def get_ld_pe_begins(self, app_phase):
    """ Begin timestamps (ms) for driver-side execution in app_phase. """
    return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server")

  def get_ld_pe_ends(self, app_phase):
    """ End timestamps (ms) for driver-side execution in app_phase.

        Deliberately uses get_begins(): per the comment above, the *start*
        of the completion callback is taken as the execution end point.
    """
    return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client")

  def get_stat(self, tag, app_phase, special_case_pe=True):
    """ Return the total elapsed ms for "<app_phase>_<tag>" (0.0 if absent).

        Calculates stats lazily on first use. When special_case_pe is True,
        the runtime-layer Execution phase is computed from begin/end pairs
        instead of the call tree, as explained inline below.
    """
    if not self.stats and not self.mytree.is_empty():
      self.calculate_stats()
    if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe:
      # Execution is exposed as an asynchronous event from the runtime, we
      # calculate the runtime time as starting from when the async operation is
      # kicked off until wait finishes + synchronous setup and teardown calls.
      # This has two limitations:
      # - multithreaded usage will not work correctly
      # - should the application spend so much time before calling wait that
      #   execution has already finished, the time would get allocated to the
      #   runtime incorrectly
      async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute")
      async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait")
      elapsed = 0.0
      # Pair the i-th startCompute with the i-th wait (single-threaded
      # assumption, see limitation above).
      for i in range(0, len(async_starts)):
        elapsed = elapsed + (async_ends[i] - async_starts[i])
      for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free",
                   "ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free",
                   "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput",
                   "ANeuralNetworksExecution_setInputFromMemory",
                   "ANeuralNetworksExecution_setOutputFromMemory"]:
        sync_starts = self.get_begins(app_phase, sync)
        sync_ends = self.get_ends(app_phase, sync)
        for i in range(0, len(sync_starts)):
          elapsed = elapsed + (sync_ends[i] - sync_starts[i])
      return elapsed
    return self.stats.get(app_phase + "_" + tag, 0.0)

  def get_execution_count(self, app_phase):
    """ Number of executions observed in app_phase. """
    # ANeuralNetworksExecution_create is reliable and comes from the runtime,
    # but not available pre-P
    count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create"))
    if count > 0:
      return count
    # Application may have added tracepoints
    return self.la_pe_counts.get(app_phase, 0)

  def get_begins(self, app_phase, function):
    """ List of begin timestamps (ms) recorded for function in app_phase. """
    name = app_phase + "::" + function
    return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def get_ends(self, app_phase, function):
    """ List of end timestamps (ms) recorded for function in app_phase. """
    name = app_phase + "::" + function
    return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def print_stats(self):
    """ Print per-tag totals and their contributing items for debugging. """
    if not self.stats:
      self.calculate_stats()
    print(self.tgid, "Driver" if self.is_driver else "App")
    for tag in self.stats:
      print(tag, self.stats[tag])
      if self.items.get(tag):
        for item in self.items[tag]:
          print("  ", item)
      else:
        # Tag was produced only by aggregation (e.g. a rolled-up PE total).
        print("  ", "calculated only")

  def print(self):
    """ Dump the underlying call tree. """
    self.mytree.print()