# Copyright 2015-2017 ARM Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Generic functions that can be used in multiple places in trappy
"""

import pandas as pd
import numpy as np


def listify(to_select):
    """Utility function to handle both single and list inputs.

    :param to_select: a single value or a list of values
    :return: ``to_select`` unchanged if it is already a list,
        otherwise ``[to_select]``
    """

    if not isinstance(to_select, list):
        to_select = [to_select]

    return to_select


def handle_duplicate_index(data,
                           max_delta=0.000001):
    """Handle duplicate values in index

    :param data: The timeseries input
    :type data: :mod:`pandas.Series`

    :param max_delta: Maximum interval adjustment value that
        will be added to duplicate indices
    :type max_delta: float

    Consider the following case where a series needs to be reindexed
    to a new index (which can be required when different series need to
    be combined and compared):
    ::

        import pandas
        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        new_index = [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
        series.reindex(new_index)

    The above code fails with:
    ::

        ValueError: cannot reindex from a duplicate axis

    The function :func:`handle_duplicate_index` changes the duplicate values
    to
    ::

        >>> import pandas
        >>> from trappy.utils import handle_duplicate_index

        >>> values = [0, 1, 2, 3, 4]
        >>> index = [0.0, 1.0, 1.0, 6.0, 7.0]
        >>> series = pandas.Series(values, index=index)
        >>> series = handle_duplicate_index(series)
        >>> print(series.index.values)
        [ 0.        1.        1.000001  6.        7.      ]

    """

    index = data.index
    # Work on a copy: mutating index.values in place (as the old code
    # did) silently modifies the index's own buffer.
    new_index = index.values.copy()

    # Unique values that occur more than once in the index.
    # (Index.get_duplicates() was removed in pandas 0.25.)
    dups = index[index.duplicated()].unique()

    for dup in dups:
        # Leave the first occurrence intact.
        # NOTE(review): searchsorted assumes the index is sorted, which
        # holds for trace timestamps — confirm for other callers.
        dup_index_left = index.searchsorted(dup, side="left")
        dup_index_right = index.searchsorted(dup, side="right") - 1
        num_dups = dup_index_right - dup_index_left + 1

        # Calculate the per-duplicate delta: spread the duplicates
        # evenly across the gap to the next (larger) index value
        try:
            delta = (index[dup_index_right + 1] - dup) / num_dups
        except IndexError:
            # dup_index_right + 1 is outside of the series (i.e. the
            # dup is at the end of the series).
            delta = max_delta

        # Clamp the maximum delta added to max_delta
        if delta > max_delta:
            delta = max_delta

        # The k-th duplicate (k = 1..num_dups-1) gets offset k*delta, so
        # every offset stays strictly below the gap to the next index
        # value.  (The old code doubled delta each step, which for four
        # or more duplicates could push the last one onto — or past —
        # the next index value, recreating a duplicate.)
        for step, pos in enumerate(range(dup_index_left + 1,
                                         dup_index_right + 1), 1):
            new_index[pos] += delta * step

    # Relabel positionally rather than calling data.reindex(new_index):
    # reindex only worked before because new_index aliased the index's
    # underlying buffer.
    result = data.copy()
    result.index = pd.Index(new_index, name=index.name)
    return result


# Iterate fast over all rows in a data frame and apply fn
def apply_callback(df, fn, *kwargs):
    """Invoke ``fn`` once per row of ``df`` using ``itertuples()``.

    :param df: frame to iterate; the row's index value is exposed to
        ``fn`` under the key ``"Time"``
    :param fn: callback receiving a dict mapping column name (plus
        ``"Time"``) to the row's value
    :param kwargs: despite the name, this collects extra *positional*
        arguments; if any are given they are passed to ``fn`` as a
        single tuple second argument (quirk kept for compatibility)
    """
    iters = df.itertuples()

    # Column names beginning with underscore will not be preserved in tuples
    # due to constraints on namedtuple field names, so store mappings from
    # column name to column number for each trace event.  Position 0 of
    # each tuple is the row's index value, exposed as 'Time'.
    col_idxs = {name: idx
                for idx, name in enumerate(['Time'] + df.columns.tolist())}

    # next(iters, None) instead of iters.next(): Python-3 compatible and
    # handles an empty frame without raising StopIteration.
    event_tuple = next(iters, None)
    while event_tuple is not None:
        event_dict = {col: event_tuple[idx] for col, idx in col_idxs.items()}

        if kwargs:
            fn(event_dict, kwargs)
        else:
            fn(event_dict)

        event_tuple = next(iters, None)


def merge_dfs(pr_df, sec_df, pivot):
    """Merge a primary and a secondary frame in event (``__line``) order.

    Each primary row with NaN fields is filled from the most recent
    secondary row that shares the same ``pivot`` value.  Only primary
    rows appear in the result, which is indexed by time.

    :param pr_df: primary frame; must contain a ``__line`` column
    :param sec_df: secondary frame; must contain ``__line`` and share
        the ``pivot`` column with ``pr_df``
    :param pivot: name of the column used to associate secondary rows
        with primary rows
    """
    # Keep track of last secondary event, keyed by pivot value
    pivot_map = {}

    # An array accumulating dicts with merged data
    merged_data = []

    def df_fn(data):
        # After the keyed concat below, 'Time' is a (label, time) tuple
        # where label is 'primary' or 'secondary'.
        if data['Time'][0] == 'secondary':
            # Store the latest secondary info
            pivot_map[data[pivot]] = data
            # Get rid of primary/secondary labels
            data['Time'] = data['Time'][1]
            return

        # Propagate latest secondary info
        for key, value in data.items():
            if key == pivot:
                continue
            # Fast check for if value is nan (faster than np.isnan + try/except)
            if value != value and data[pivot] in pivot_map:
                data[key] = pivot_map[data[pivot]][key]

        # Get rid of primary/secondary labels
        data['Time'] = data['Time'][1]
        merged_data.append(data)

    # DataFrame.sort() was removed in pandas 0.20; sort_values() is the
    # equivalent.  __line is assumed to totally order events across both
    # frames.
    df = pd.concat([pr_df, sec_df],
                   keys=['primary', 'secondary']).sort_values(by='__line')
    apply_callback(df, df_fn)
    merged_df = pd.DataFrame(merged_data)
    merged_df.set_index('Time', inplace=True)

    return merged_df