""" NNAPI Systrace parser - tracking of call tree based on trace lines See contract-between-code-and-parser.txt for the specification (cases in the specification are referred to with SPEC). """ import re import sys from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag, get_function_name_from_mark, make_tag) from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK from parser.tree import SingleThreadCallTree class AppPhase(object): """ Class to track the overall phase of the program. Used to split up warmup and benchmark. Needs to be separate from the call trees to propagate the difference to driver. """ def __init__(self): self.reset() def current(self): if self.stack: return self.stack[-1] else: return PHASE_OVERALL def push(self, phase): self.stack.append(phase) def pop(self): self.stack.pop() def reset(self): self.stack = [] class Tracker(object): """ Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree as well as keeping track of entry and exit times for functions. Exposes statistics for a single thread, transforming the call tree as needed. All statistics are in milliseconds. Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat(). Subphases of Execution are aggregated towards the overall Execution phase as needed. """ def __init__(self, tgid, is_driver, app_phase): self.tgid = tgid self.is_driver = is_driver self.app_phase = app_phase # Match the trace string # "[NN_LA_PP]funcE1" in "B||[NN_LA_PP]funcE1" # "[NN_LC_PCO]funcC1" in "B||[SW][NN_LC_PCO]funcC1" self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?") self.reset() def reset(self): self.stats = {} self.items = {} self.mytree = SingleThreadCallTree() self.begins_and_ends_ms = {} self.la_pe_counts = {} self.debugstring = "\n" def handle_mark(self, time, mark): """ Handle a single trace item (scoped entry and exit). Translates: - Automatically generated HIDL traces into NNTRACE layers and phases - SPEC:Switch phase during function into dummy items - SPEC:Subtracting time when nesting is violated into "subtract" markers - CPU/Driver layer distinction based on whether the process is the driver or an application This function is called multiple times for a single application run, afterwards the statistics can be calculated. """ if mark[0] == "B": switch = False subtract = False # Workarounds for wrong tracepoints in early versions # TODO(mikie): remove later if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark): mark = mark.replace("_PT", "_PE") # Workarounds for trace marker for getSupportedExtensions (fixed in ag/9484333) if ("getSupportedExtensions" in mark): mark = mark.replace("_PC", "_PI") elif ("[SW][NN_LA_PR]executeWithCompilation" in mark): mark = mark.replace("[SW]", "") if MARKER_SWITCH in mark: switch = True if MARKER_SUBTRACT in mark: subtract = True if switch: # End previous item self.handle_mark(time, "E") # Push a placeholder item that will get popped by the 'real' end of the # previous item. self.mytree.push_dummy(time) m = self.matcher.search(mark) if m is None: tag = translate_hidl_mark_to_nn_and_tag(mark) if tag is None: raise Exception("Couldn't parse mark " + mark) else: tag = m.group(1) [_, layer, phase] = tag.split("_") if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]: self.app_phase.push(phase) if not self.is_driver: layer = layer.replace(LAYER_DRIVER, LAYER_CPU) else: layer = layer.replace(LAYER_CPU, LAYER_DRIVER) if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION: self.la_pe_counts[self.app_phase.current()] = ( self.la_pe_counts.get(self.app_phase.current(), 0) + 1) self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract) elif mark[0] == "E": try: node = self.mytree.pop(time) if node.is_dummy(): # Placeholder item pass else: if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]: self.app_phase.pop() function = node.app_phase + "::" + get_function_name_from_mark(node.mark) self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) + [[float(node.start_time_s) * 1000.0, float(node.end_time_s) * 1000.0]]) except IndexError as e: raise Exception("Unable to process a trace termination mark, please check that the collected trace are including full application lifecycles.\n") from e def is_complete(self): """ Checks if we've seen all end tracepoints for the begin tracepoints. """ return self.mytree.current.is_root() def calculate_stats(self): assert self.is_complete() self.mytree.remove_ignored() self.mytree.remove_dummies() self.mytree.copy_subtracted_init_and_wrong_la() self.mytree.add_missing_la_nodes() # self.mytree.print() self.mytree.validate_nesting() def recurse(node, prev_layer, prev_phase, indent, in_pe_layers): [begun, mark, layer, phase] = [ node.start_time_s, node.mark, node.layer, node.phase()] time = node.end_time_s tag = None elapsed0 = "DETAIL" elapsed1 = node.elapsed_less_subtracted_ms() if elapsed1 is None: raise Exception("Elapsed for {} returned None".format(node.to_str())) if not node.is_added_detail() and not node.subtract: tag = node.app_phase + "_" + layer + "_" + phase elapsed0 = elapsed1 self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0 self.items[tag] = self.items.get(tag, []) + [ mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + tag] if phase in subphases[PHASE_EXECUTION]: if not in_pe_layers.get(layer): pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION) self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0 self.items[pe_tag] = self.items.get(pe_tag, []) + [ mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + pe_tag] if phase == PHASE_EXECUTION: in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1 for c in node.children: recurse(c, layer or prev_layer, phase or prev_phase, indent + " ", in_pe_layers) if phase == PHASE_EXECUTION: in_pe_layers[layer] = in_pe_layers[layer] - 1 return for top in self.mytree.root.children: recurse(top, None, None, "", {}) self.debugstring = self.mytree.to_str() # We need to special case the driver execution time because: # - The existing drivers don't have tracing, so we rely on HIDL traces # - Best we can do is to take the start of the HIDL server side call as # the starting point (which includes a bit of overhead, but not much) and # the start of the callback as the end point (which should be pretty # accurate) # Note that the begin and end may be on different threads, hence the # calculation needs to happen in aggregation rather than here. def get_ld_pe_begins(self, app_phase): return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server") def get_ld_pe_ends(self, app_phase): return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client") def get_stat(self, tag, app_phase, special_case_pe=True): if not self.stats and not self.mytree.is_empty(): self.calculate_stats() if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe: # Execution is exposed as an asynchronous event from the runtime, we # calculate the runtime time as starting from when the async operation is # kicked off until wait finishes + synchronous setup and teardown calls. # This has two limitations: # - multithreaded usage will not work correctly # - should the application spend so much time before calling wait that # execution has already finished, the time would get allocated to the # runtime incorrectly async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute") async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait") elapsed = 0.0 for i in range(0, len(async_starts)): elapsed = elapsed + (async_ends[i] - async_starts[i]) for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free", "ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free", "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput", "ANeuralNetworksExecution_setInputFromMemory", "ANeuralNetworksExecution_setOutputFromMemory"]: sync_starts = self.get_begins(app_phase, sync) sync_ends = self.get_ends(app_phase, sync) for i in range(0, len(sync_starts)): elapsed = elapsed + (sync_ends[i] - sync_starts[i]) return elapsed return self.stats.get(app_phase + "_" + tag, 0.0) def get_execution_count(self, app_phase): # ANeuralNetworksExecution_create is reliable and comes from the runtime, # but not available pre-P count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create")) if count > 0: return count # Application may have added tracepoints return self.la_pe_counts.get(app_phase, 0) def get_begins(self, app_phase, function): name = app_phase + "::" + function return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])] def get_ends(self, app_phase, function): name = app_phase + "::" + function return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])] def print_stats(self): if not self.stats: self.calculate_stats() print(self.tgid, "Driver" if self.is_driver else "App") for tag in self.stats: print(tag, self.stats[tag]) if self.items.get(tag): for item in self.items[tag]: print(" ", item) else: print(" ", "calculated only") def print(self): self.mytree.print()