1 /* 2 * Copyright (C) 2019 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.tradefed.device.metric; 17 18 import static com.android.tradefed.util.proto.TfMetricProtoUtil.stringToMetric; 19 20 import com.android.os.AtomsProto.Atom; 21 import com.android.os.StatsLog.EventMetricData; 22 import com.android.os.StatsLog.StatsdStatsReport; 23 import com.android.tradefed.config.OptionClass; 24 import com.android.tradefed.device.DeviceNotAvailableException; 25 import com.android.tradefed.device.ITestDevice; 26 import com.android.tradefed.device.TestDeviceState; 27 import com.android.tradefed.log.LogUtil.CLog; 28 import com.android.tradefed.metrics.proto.MetricMeasurement.Metric; 29 import com.android.tradefed.util.statsd.ConfigUtil; 30 import com.android.tradefed.util.statsd.MetricUtil; 31 32 import com.google.common.annotations.VisibleForTesting; 33 import com.google.common.collect.Iterables; 34 import com.google.protobuf.InvalidProtocolBufferException; 35 36 import java.io.IOException; 37 import java.text.SimpleDateFormat; 38 import java.util.Arrays; 39 import java.util.ArrayList; 40 import java.util.Date; 41 import java.util.HashMap; 42 import java.util.List; 43 import java.util.Map; 44 import java.util.NoSuchElementException; 45 import java.util.concurrent.TimeUnit; 46 import java.util.stream.Collectors; 47 48 /** 49 * Collector that collects timestamps of runtime restarts (system server crashes) during the test 50 * run, if any. 51 * 52 * <p>Outputs results in counts, wall clock time in seconds and in HH:mm:ss format, and system 53 * uptime in nanoseconds and HH:mm:ss format. 54 * 55 * <p>This collector uses two sources for system server crashes: 56 * 57 * <p> 58 * 59 * <ol> 60 * <li>The system_restart_sec list from StatsdStatsReport, which is a rolling list of 20 61 * timestamps when the system server crashes, in seconds, with newer crashes appended to the 62 * end (when the list fills up, older timestamps fall off the beginning). 63 * <li>The AppCrashOccurred statsd atom, where a system server crash shows up as a system_server 64 * process crash (this behavior is documented in the statsd atoms.proto definition). The event 65 * metric gives the device uptime when the crash occurs. 66 * </ol> 67 * 68 * <p>Both can be useful information, as the former makes it easy to correlate timestamps in logs, 69 * while the latter serves as a longevity metric. 70 */ 71 @OptionClass(alias = "runtime-restart-collector") 72 public class RuntimeRestartCollector extends BaseDeviceMetricCollector { 73 private static final String METRIC_SEP = "-"; 74 public static final String METRIC_PREFIX = "runtime-restart"; 75 public static final String METRIC_SUFFIX_COUNT = "count"; 76 public static final String METRIC_SUFFIX_SYSTEM_TIMESTAMP_SECS = "timestamps_secs"; 77 public static final String METRIC_SUFFIX_SYSTEM_TIMESTAMP_FORMATTED = "timestamps_str"; 78 public static final String METRIC_SUFFIX_UPTIME_NANOS = "uptime_nanos"; 79 public static final String METRIC_SUFFIX_UPTIME_FORMATTED = "uptime_str"; 80 public static final SimpleDateFormat TIME_FORMATTER = new SimpleDateFormat("HH:mm:ss"); 81 public static final String SYSTEM_SERVER_KEYWORD = "system_server"; 82 83 private List<ITestDevice> mTestDevices; 84 // Map to store the runtime restart timestamps for each device, keyed by device serial numbers. 85 private Map<String, List<Integer>> mExistingTimestamps = new HashMap<>(); 86 // Map to store the statsd config IDs for each device, keyed by device serial numbers. 87 private Map<String, Long> mDeviceConfigIds = new HashMap<>(); 88 // Flag variable to indicate whether including the device serial in the metrics is needed. 89 // Device serial is included when there are more than one device under test. 90 private boolean mIncludeDeviceSerial = false; 91 92 /** 93 * Store the existing timestamps of system server restarts prior to the test run as statsd keeps 94 * a running log of them, and push the config to collect app crashes. 95 */ 96 @Override onTestRunStart(DeviceMetricData runData)97 public void onTestRunStart(DeviceMetricData runData) throws DeviceNotAvailableException { 98 mTestDevices = getDevices(); 99 mIncludeDeviceSerial = (mTestDevices.size() > 1); 100 for (ITestDevice device : mTestDevices) { 101 // Pull statsd metadata (StatsdStatsReport) from the device and keep track of existing 102 // runtime restart timestamps. 103 try { 104 List<Integer> existingTimestamps = 105 getStatsdMetadata(device).getSystemRestartSecList(); 106 mExistingTimestamps.put(device.getSerialNumber(), existingTimestamps); 107 } catch (InvalidProtocolBufferException e) { 108 // Error is not thrown as we still want to process the other devices. 109 CLog.e( 110 "Failed to get statsd metadata from device %s. Exception: %s.", 111 device.getSerialNumber(), e); 112 continue; 113 } 114 115 // Register statsd config to collect app crashes. 116 try { 117 mDeviceConfigIds.put( 118 device.getSerialNumber(), 119 pushStatsConfig( 120 device, Arrays.asList(Atom.APP_CRASH_OCCURRED_FIELD_NUMBER))); 121 } catch (IOException e) { 122 // Error is not thrown as we still want to push the config to other devices. 123 CLog.e( 124 "Failed to push statsd config to device %s. Exception: %s.", 125 device.getSerialNumber(), e); 126 continue; 127 } 128 } 129 } 130 131 /** 132 * Pull the timestamps at the end of the test run and report the difference with existing ones, 133 * if any. 134 */ 135 @Override onTestRunEnd(DeviceMetricData runData, final Map<String, Metric> currentRunMetrics)136 public void onTestRunEnd(DeviceMetricData runData, final Map<String, Metric> currentRunMetrics) 137 throws DeviceNotAvailableException { 138 for (ITestDevice device : mTestDevices) { 139 if (!TestDeviceState.ONLINE.equals(device.getDeviceState())) { 140 CLog.d( 141 "Device '%s' is in state '%s' skipping RuntimeRestartCollector", 142 device.getSerialNumber(), device.getDeviceState()); 143 continue; 144 } 145 // Pull statsd metadata again to look at the changes of system server crash timestamps 146 // during the test run, and report them. 147 // A copy of the list is created as the list returned by the proto is not modifiable. 148 List<Integer> updatedTimestamps = new ArrayList<>(); 149 try { 150 updatedTimestamps.addAll(getStatsdMetadata(device).getSystemRestartSecList()); 151 } catch (InvalidProtocolBufferException e) { 152 // Error is not thrown as we still want to process the other devices. 153 CLog.e( 154 "Failed to get statsd metadata from device %s. Exception: %s.", 155 device.getSerialNumber(), e); 156 continue; 157 } 158 if (!mExistingTimestamps.containsKey(device.getSerialNumber())) { 159 CLog.e("No prior state recorded for device %s.", device.getSerialNumber()); 160 addStatsdStatsBasedMetrics( 161 currentRunMetrics, updatedTimestamps, device.getSerialNumber()); 162 } else { 163 try { 164 int lastTimestampBeforeTestRun = 165 Iterables.getLast(mExistingTimestamps.get(device.getSerialNumber())); 166 // If the last timestamp is not found, lastIndexOf(...) + 1 returns 0 which is 167 // what is needed in that situation. 168 addStatsdStatsBasedMetrics( 169 currentRunMetrics, 170 getAllValuesAfter(lastTimestampBeforeTestRun, updatedTimestamps), 171 device.getSerialNumber()); 172 } catch (NoSuchElementException e) { 173 // The exception occurs when no prior runtime restarts had been recorded. It is 174 // thrown from Iterables.getLast(). 175 addStatsdStatsBasedMetrics( 176 currentRunMetrics, updatedTimestamps, device.getSerialNumber()); 177 } 178 } 179 } 180 181 // Pull the AppCrashOccurred event metric reports and report the system server crashes 182 // within. 183 for (ITestDevice device : mTestDevices) { 184 if (!mDeviceConfigIds.containsKey(device.getSerialNumber())) { 185 CLog.e("No config ID is associated with device %s.", device.getSerialNumber()); 186 continue; 187 } 188 long configId = mDeviceConfigIds.get(device.getSerialNumber()); 189 List<Long> uptimeListNanos = new ArrayList<>(); 190 List<EventMetricData> metricData = getEventMetricData(device, configId); 191 uptimeListNanos.addAll( 192 metricData.stream() 193 .filter(d -> d.hasElapsedTimestampNanos()) 194 .filter(d -> d.hasAtom()) 195 .filter(d -> d.getAtom().hasAppCrashOccurred()) 196 .filter(d -> d.getAtom().getAppCrashOccurred().hasProcessName()) 197 .filter( 198 d -> 199 SYSTEM_SERVER_KEYWORD.equals( 200 d.getAtom() 201 .getAppCrashOccurred() 202 .getProcessName())) 203 .map(d -> d.getElapsedTimestampNanos()) 204 .collect(Collectors.toList())); 205 addAtomBasedMetrics(currentRunMetrics, uptimeListNanos, device.getSerialNumber()); 206 removeConfig(device, configId); 207 } 208 } 209 210 /** Helper method to add metrics from StatsdStatsReport according to timestamps. */ addStatsdStatsBasedMetrics( final Map<String, Metric> metrics, List<Integer> timestampsSecs, String serial)211 private void addStatsdStatsBasedMetrics( 212 final Map<String, Metric> metrics, List<Integer> timestampsSecs, String serial) { 213 // Always add a count of system server crashes, regardless of whether there are any. 214 // The statsd metadata-based count is used as the atom-based data can miss runtime restart 215 // instances. 216 // TODO(b/135770315): Re-assess this after the root cause for missing instances in the atom 217 // -based results is found. 218 String countMetricKey = createMetricKey(METRIC_SUFFIX_COUNT, serial); 219 metrics.put(countMetricKey, stringToMetric(String.valueOf(timestampsSecs.size()))); 220 // If there are runtime restarts, add a comma-separated list of timestamps. 221 if (!timestampsSecs.isEmpty()) { 222 // Store both the raw timestamp and the formatted, more readable version. 223 224 String timestampMetricKey = 225 createMetricKey(METRIC_SUFFIX_SYSTEM_TIMESTAMP_SECS, serial); 226 metrics.put( 227 timestampMetricKey, 228 stringToMetric( 229 timestampsSecs 230 .stream() 231 .map(t -> String.valueOf(t)) 232 .collect(Collectors.joining(",")))); 233 234 String formattedTimestampMetricKey = 235 createMetricKey(METRIC_SUFFIX_SYSTEM_TIMESTAMP_FORMATTED, serial); 236 metrics.put( 237 formattedTimestampMetricKey, 238 stringToMetric( 239 timestampsSecs 240 .stream() 241 .map(t -> timestampToHoursMinutesSeconds(t)) 242 .collect(Collectors.joining(",")))); 243 } 244 } 245 246 /** Helper method to add metrics from the AppCrashOccurred atoms according to timestamps. */ addAtomBasedMetrics( final Map<String, Metric> metrics, List<Long> timestampsNanos, String serial)247 private void addAtomBasedMetrics( 248 final Map<String, Metric> metrics, List<Long> timestampsNanos, String serial) { 249 // If there are runtime restarts, add a comma-separated list of device uptime timestamps. 250 if (!timestampsNanos.isEmpty()) { 251 // Store both the raw timestamp and the formatted, more readable version. 252 253 String uptimeNanosMetricKey = createMetricKey(METRIC_SUFFIX_UPTIME_NANOS, serial); 254 metrics.put( 255 uptimeNanosMetricKey, 256 stringToMetric( 257 timestampsNanos 258 .stream() 259 .map(t -> String.valueOf(t)) 260 .collect(Collectors.joining(",")))); 261 262 String formattedUptimeMetricKey = 263 createMetricKey(METRIC_SUFFIX_UPTIME_FORMATTED, serial); 264 metrics.put( 265 formattedUptimeMetricKey, 266 stringToMetric( 267 timestampsNanos 268 .stream() 269 .map(t -> nanosToHoursMinutesSeconds(t)) 270 .collect(Collectors.joining(",")))); 271 } 272 } 273 createMetricKey(String suffix, String serial)274 private String createMetricKey(String suffix, String serial) { 275 return mIncludeDeviceSerial 276 ? String.join(METRIC_SEP, METRIC_PREFIX, serial, suffix) 277 : String.join(METRIC_SEP, METRIC_PREFIX, suffix); 278 } 279 280 /** 281 * Forwarding logic to {@link ConfigUtil} for testing as it is a static utility class. 282 * 283 * @hide 284 */ 285 @VisibleForTesting pushStatsConfig(ITestDevice device, List<Integer> eventAtomIds)286 protected long pushStatsConfig(ITestDevice device, List<Integer> eventAtomIds) 287 throws IOException, DeviceNotAvailableException { 288 return ConfigUtil.pushStatsConfig(device, eventAtomIds); 289 } 290 291 /** 292 * Forwarding logic to {@link ConfigUtil} for testing as it is a static utility class. 293 * 294 * @hide 295 */ 296 @VisibleForTesting removeConfig(ITestDevice device, long configId)297 protected void removeConfig(ITestDevice device, long configId) 298 throws DeviceNotAvailableException { 299 ConfigUtil.removeConfig(device, configId); 300 } 301 302 /** 303 * Forwarding logic to {@link MetricUtil} for testing as it is a static utility class. 304 * 305 * @hide 306 */ 307 @VisibleForTesting getStatsdMetadata(ITestDevice device)308 protected StatsdStatsReport getStatsdMetadata(ITestDevice device) 309 throws DeviceNotAvailableException, InvalidProtocolBufferException { 310 return MetricUtil.getStatsdMetadata(device); 311 } 312 313 /** 314 * Forwarding logic to {@link MetricUtil} for testing as it is a static utility class. 315 * 316 * @hide 317 */ 318 @VisibleForTesting getEventMetricData(ITestDevice device, long configId)319 protected List<EventMetricData> getEventMetricData(ITestDevice device, long configId) 320 throws DeviceNotAvailableException { 321 return MetricUtil.getEventMetricData(device, configId); 322 } 323 324 /** 325 * Helper method to convert nanoseconds to HH:mm:ss. {@link SimpleDateFormat} is not used here 326 * as the input is not a real epoch-based timestamp. 327 */ nanosToHoursMinutesSeconds(long nanos)328 private String nanosToHoursMinutesSeconds(long nanos) { 329 long hours = TimeUnit.NANOSECONDS.toHours(nanos); 330 nanos -= TimeUnit.HOURS.toNanos(hours); 331 long minutes = TimeUnit.NANOSECONDS.toMinutes(nanos); 332 nanos -= TimeUnit.MINUTES.toNanos(minutes); 333 long seconds = TimeUnit.NANOSECONDS.toSeconds(nanos); 334 return String.format("%02d:%02d:%02d", hours, minutes, seconds); 335 } 336 337 /** Helper method to format an epoch-based timestamp in seconds to HH:mm:ss. */ timestampToHoursMinutesSeconds(int seconds)338 private String timestampToHoursMinutesSeconds(int seconds) { 339 return TIME_FORMATTER.format(new Date(TimeUnit.SECONDS.toMillis(seconds))); 340 } 341 342 /** Helper method to get all values in a list that appear after all occurrences of target. */ getAllValuesAfter(int target, List<Integer> l)343 private List<Integer> getAllValuesAfter(int target, List<Integer> l) { 344 return l.subList(l.lastIndexOf(target) + 1, l.size()); 345 } 346 } 347