""" NNAPI systrace parser - aggregation of timing from multiple threads """

# TODO:
# - phase and layer totals

import math

from parser.naming import layers, phases, subphases
from parser.naming import (PHASE_OVERALL, PHASE_TERMINATION, PHASE_WARMUP,
                           PHASE_BENCHMARK, PHASE_EXECUTION,
                           PHASE_INITIALIZATION, PHASE_INPUTS_AND_OUTPUTS,
                           PHASE_RESULTS)
from parser.naming import LAYER_APPLICATION, LAYER_IPC, LAYER_DRIVER

LAYER_TOTAL = "LT"  # Total across layers


def aggregate_times(tracker_map, special_case_lr_pe=True):
    """ Takes the trackers for each thread and produces timing statistics for
        all layers and phases.

        Arguments:
        - tracker_map: mapping whose values are tracker objects providing
          get_execution_count(app_phase), get_ld_pe_begins(app_phase),
          get_ld_pe_ends(app_phase) and get_stat(tag, app_phase, flag)
        - special_case_lr_pe: passed through unchanged as the third argument
          of every get_stat() call

        Returns (times, self_times, has_warmup and has_benchmark,
        execution_counts), where:
        - times and self_times are nested dictionaries of the form
          phase -> layer -> time with the following notes:
          - phase is flattened over all phases, except PHASE_WARMUP and
            PHASE_BENCHMARK, where the structure is
            phase -> phase -> layer -> time
          - PHASE_WARMUP and PHASE_BENCHMARK only nest execution and its
            subphases
          - PHASE_WARMUP and PHASE_BENCHMARK are not present if the trace
            does not contain both of them
          - the first level phase contains total over PHASE_WARMUP and
            PHASE_BENCHMARK if present
          - time may be math.nan if the data is not present in the trace
          - in addition to the layers from parser.naming, LAYER_TOTAL holds
            the largest time recorded by any layer for that phase
        - execution_counts contains a dictionary of the form
          {PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK} -> no of executions

        Raises AssertionError if the driver execution begin and end
        timestamps gathered from the trackers differ in number.
    """
    all_application_phases = [PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK]

    # Calculate execution counts: the largest count reported by any tracker
    execution_counts = {}
    for app_phase in all_application_phases:
        execution_count = 0
        for tracker in tracker_map.values():
            execution_count = max(execution_count,
                                  tracker.get_execution_count(app_phase))
        execution_counts[app_phase] = execution_count
    has_warmup = bool(execution_counts[PHASE_WARMUP])
    has_benchmark = bool(execution_counts[PHASE_BENCHMARK])
    # The per-application-phase breakdown is only produced when the trace
    # contains both warmup and benchmark executions.
    has_app_phases = has_warmup and has_benchmark
    if not has_app_phases:
        all_application_phases = [PHASE_OVERALL]

    # Create dicts
    times = {}
    self_times = {}
    if has_app_phases:
        for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
            times[app_phase] = {}
            self_times[app_phase] = {}
            for phase in _phase_and_subphases(PHASE_EXECUTION):
                times[app_phase][phase] = {}
                self_times[app_phase][phase] = {}
    for phase in phases + [PHASE_OVERALL] + subphases[PHASE_EXECUTION]:
        times[phase] = {}
        self_times[phase] = {}

    # Gather total times from all threads, calculate layer and phase totals
    for layer in layers:
        for phase0 in [PHASE_OVERALL] + phases:
            for phase in _phase_and_subphases(phase0):
                t = 0.0
                tag = layer + "_" + phase
                for app_phase in all_application_phases:
                    if layer == LAYER_DRIVER and phase == PHASE_EXECUTION:
                        t0 = _driver_execution_time(tracker_map, app_phase)
                    else:
                        t0 = 0.0
                        for tracker in tracker_map.values():
                            t0 += tracker.get_stat(tag, app_phase,
                                                   special_case_lr_pe)
                    if phase0 == PHASE_EXECUTION and app_phase != PHASE_OVERALL:
                        times[app_phase][phase][layer] = \
                            zero_to_nan_if_missing(t0, phase, layer)
                    t += t0
                times[phase][layer] = zero_to_nan_if_missing(t, phase, layer)
        # NaN is truthy, so this only replaces an overall time of exactly 0.0
        # with the sum over the individual phases; NaN entries are kept.
        if not times[PHASE_OVERALL][layer]:
            times[PHASE_OVERALL][layer] = sum(
                nan_to_zero(times[phase][layer]) for phase in phases)
    for phase0 in [PHASE_OVERALL] + phases:
        for phase in _phase_and_subphases(phase0):
            times[phase][LAYER_TOTAL] = max_ignoring_nans(times[phase].values())
            if phase0 == PHASE_EXECUTION and has_app_phases:
                for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
                    times[app_phase][phase][LAYER_TOTAL] = max_ignoring_nans(
                        times[app_phase][phase].values())

    # Calculate self-times for each layer
    for phase0 in [PHASE_OVERALL] + phases:
        for phase in _phase_and_subphases(phase0):
            self_times[phase][LAYER_TOTAL] = times[phase][LAYER_TOTAL]
            if phase0 == PHASE_EXECUTION and has_app_phases:
                for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
                    self_times[app_phase][phase][LAYER_TOTAL] = \
                        times[app_phase][phase][LAYER_TOTAL]
            # Walk the layers in reverse order; t carries the time of the
            # last layer that had data, which is subtracted from the next.
            t = 0.0
            for layer in reversed(layers):
                layer_time = times[phase][layer]
                if math.isnan(layer_time):
                    self_times[phase][layer] = math.nan
                elif layer_time == 0.0:
                    self_times[phase][layer] = 0.0
                elif (phase == PHASE_OVERALL and
                      layer in (LAYER_DRIVER, LAYER_IPC) and
                      times[PHASE_EXECUTION][LAYER_DRIVER] == 0.0):
                    # Driver was only used for initialization phase, did not
                    # support execution of the model
                    if layer == LAYER_DRIVER:
                        self_times[phase][layer] = layer_time
                    else:
                        self_times[phase][layer] = (
                            layer_time - times[phase][LAYER_DRIVER])
                else:
                    self_times[phase][layer] = layer_time - t
                    t = layer_time
            # The warmup/benchmark dicts only exist when both are present.
            # This guard previously used "or", which raised KeyError for
            # traces containing only one of the two.
            if phase0 == PHASE_EXECUTION and has_app_phases:
                for app_phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
                    _plain_self_times(times[app_phase][phase],
                                      self_times[app_phase][phase])

    return (times, self_times, has_app_phases, execution_counts)


def _driver_execution_time(tracker_map, app_phase):
    """ Sum driver execution time for app_phase from the begin and end
        timestamps of all trackers, pairing them up in sorted order. """
    begins = []
    ends = []
    for tracker in tracker_map.values():
        begins.extend(tracker.get_ld_pe_begins(app_phase))
        ends.extend(tracker.get_ld_pe_ends(app_phase))
    assert len(begins) == len(ends)
    begins.sort()
    ends.sort()
    total = 0.0
    for begin, end in zip(begins, ends):
        total += (end - begin)
    return total


def _plain_self_times(layer_times, layer_self_times):
    """ Fill layer_self_times from layer_times by walking the layers in
        reverse order and subtracting the time of the previous layer that had
        data. NaN and zero entries are copied through as NaN and 0.0. """
    t = 0.0
    for layer in reversed(layers):
        layer_time = layer_times[layer]
        if math.isnan(layer_time):
            layer_self_times[layer] = math.nan
        elif layer_time == 0.0:
            layer_self_times[layer] = 0.0
        else:
            layer_self_times[layer] = layer_time - t
            t = layer_time


def zero_to_nan_if_missing(f, phase, layer):
    """ Turn zero time to a NaN to indicate missing data, when we think that
        the data is really missing.

        Data should only be missing from the Application layer (applications
        may not have any tracing) and the subphases of Execution in the Driver
        layer (other phases are discernible from the automatic HIDL
        tracepoints).

        Returns math.nan in those two cases when f is zero, otherwise f
        unchanged.
    """
    if f == 0.0:
        if layer == LAYER_APPLICATION:
            return math.nan
        if layer == LAYER_DRIVER and phase in subphases[PHASE_EXECUTION]:
            return math.nan
    return f


def nan_to_zero(f):
    """ Return 0.0 if f is NaN, otherwise f unchanged. """
    if math.isnan(f):
        return 0.0
    return f


def _phase_and_subphases(phase):
    """ Return the list of phases to process for phase: just the phase itself
        for PHASE_OVERALL, nothing for PHASE_WARMUP and PHASE_BENCHMARK, and
        otherwise the phase followed by its subphases (if it has any). """
    if phase == PHASE_OVERALL:
        return [phase]
    if phase in (PHASE_WARMUP, PHASE_BENCHMARK):
        return []
    return [phase] + subphases.get(phase, [])


def max_ignoring_nans(xs):
    """ Return the maximum of xs, treating NaN entries as 0.0. """
    return max(map(nan_to_zero, xs))