#!/usr/bin/env python2
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
import copy
import json
import time
import logging

from autotest_lib.server.cros.device_health_profile.profile_constants import *


class DeviceHealthProfileError(Exception):
    """
    Generic Exception for failures from DeviceHealthProfile object.
    """


class InvalidDeviceHealthProfileKeyError(DeviceHealthProfileError):
    """
    Exception to throw when trying to get an invalid health profile key.
    """


class DeviceHealthProfile(object):
    """This class provides interfaces to access the device health profile
    that is cached on the profile host (usually a labstation).

    The profile is a dict built from DEVICE_HEALTH_PROFILE_TEMPLATE. It is
    downloaded from (or created for) the profile host by init_profile() and
    written back to the profile host by close().
    """

    def __init__(self, hostname, host_info=None, result_dir=None):
        """Initialize the class.

        @param hostname: The device hostname or identification.
        @param host_info: A HostInfo object of the device of the profile.
        @param result_dir: A result directory where we can keep local copy of
                           device profile. Defaults to '/tmp'.
        """
        self._hostname = hostname
        # Cache host-info data.
        self._device_board = host_info.board if host_info else ''
        self._device_model = host_info.model if host_info else ''
        # The profile is kept on the servo-host as a temporary location.
        # The servo-host will be provided later, see init_profile().
        self._profile_host = None
        self._health_profile = None

        # Construct remote and local file path.
        profile_filename = self._hostname + '.profile'
        self._remote_path = os.path.join(PROFILE_FILE_DIR, profile_filename)
        result_dir = result_dir or '/tmp'
        self._local_path = os.path.join(result_dir, profile_filename)

    def init_profile(self, profile_host):
        """Initialize device health profile data.

        If the cached file exists on profile host the method will download
        file to a local path and read data, otherwise create a profile data
        from template.

        @param profile_host: A ServoHost object, which is the location
                             where we store the device health profile.

        @raises DeviceHealthProfileError if the profile host is not
                provided, or is not reachable by ping or by ssh.
        """
        if not profile_host:
            raise DeviceHealthProfileError('The profile host is not provided.')
        self._profile_host = profile_host
        # Do a lightweight check to make sure the machine is up
        # (by ping), so we don't waste time on an unreachable host.
        if not self._profile_host.check_cached_up_status():
            raise DeviceHealthProfileError(
                    'The profile host %s is not reachable via ping.'
                    % self._profile_host.hostname)

        # We also want to check that the host is available for ssh.
        if not self._profile_host.is_up():
            raise DeviceHealthProfileError(
                    'The profile host %s is pingable but not sshable.'
                    % self._profile_host.hostname)

        if not self._sync_existing_profile():
            self._create_profile_from_template()

    def is_loaded(self):
        """Check if device profile was loaded or not."""
        return self._health_profile is not None

    def _sync_existing_profile(self):
        """Sync health profile from remote profile host(servohost) and
        validate profile data is not corrupted or outdated.

        @returns True if sync and validate succeed otherwise False.
        """
        if not self._profile_host.is_file_exists(self._remote_path):
            logging.debug('%s not exists on %s.', self._remote_path,
                          self._profile_host.hostname)
            return False
        self._download_profile()
        self._read_profile()
        return self._validate_profile_data(self._health_profile)

    def _download_profile(self):
        """Copy profile file from remote profile host to local path.
        """
        logging.debug('Downloading profile file from %s:%s to local path: %s',
                      self._profile_host.hostname,
                      self._remote_path,
                      self._local_path)
        self._profile_host.get_file(source=self._remote_path,
                                    dest=self._local_path)

    def _upload_profile(self):
        """Copy profile file from local path to remote profile host.
        """
        # Make sure the device health profile directory exists on profile host.
        self._profile_host.run('mkdir -p %s' % PROFILE_FILE_DIR,
                               ignore_status=True)

        logging.debug('Uploading profile from local path: %s to remote %s:%s',
                      self._local_path,
                      self._profile_host.hostname,
                      self._remote_path)
        self._profile_host.send_file(source=self._local_path,
                                     dest=self._remote_path)

    def _read_profile(self):
        """Read the local profile file and parse its JSON content.

        On success the parsed data is stored as the current health profile.
        If the content cannot be decoded, a warning is logged and the
        current health profile is reset to None.
        """
        logging.debug('Reading device health profile from: %s',
                      self._local_path)
        with open(self._local_path, 'r') as f:
            try:
                self._health_profile = json.load(f)
            except Exception as e:
                # Drop whatever was loaded before, so that a corrupted file
                # is never validated against stale in-memory profile data.
                self._health_profile = None
                logging.warning('Could not decode %s to json format, the file'
                                ' may be corrupted; %s',
                                self._local_path, str(e))

    def _dump_profile(self):
        """Dump profile data into local file.
        """
        logging.debug('Dumping device health profile to: %s', self._local_path)
        with open(self._local_path, 'w') as f:
            json.dump(self._health_profile, f)

    def _create_profile_from_template(self):
        """Create a new health profile dict from template.
        """
        logging.info('Creating new health profile from template for %s.',
                     self._hostname)
        # Deep copy so that nested containers of the template are not shared
        # with (and mutated through) this profile.
        self._health_profile = copy.deepcopy(DEVICE_HEALTH_PROFILE_TEMPLATE)
        if self._device_board or self._device_model:
            self._set_board(self._device_board)
            self._set_model(self._device_model)
        self.refresh_update_time()

    def _validate_profile_data(self, data):
        """Validate the given profile data is in good state.

        @param data: The profile data to validate.

        @returns False if data is not a dict, if its version differs from
                 PROFILE_VERSION, or if its board/model does not match the
                 board/model from host_info; True otherwise.
        """
        logging.debug('Validating health profile data.')
        if not isinstance(data, dict):
            logging.debug('Non-dict type detected, the profile data'
                          ' may be corrupted.')
            return False

        # Validate that cached health profile version is not outdated.
        input_version = data.get(PROFILE_VERSION_KEY)
        if input_version != PROFILE_VERSION:
            logging.info('The input profile version: %s is outdated,'
                         ' expected version: %s', input_version,
                         PROFILE_VERSION)
            return False

        # Validate that the cached board/model matches the device, in case
        # there was a decom/redeploy. The check is skipped for a value that
        # is empty on either side.
        cached_board = data.get(BOARD_KEY)
        cached_model = data.get(MODEL_KEY)
        if (self._device_board and cached_board
                    and (self._device_board != cached_board)):
            logging.info(
                    'The board: %s from host_info does not match board: %s'
                    ' from cached profile, the device hardware probably has'
                    ' been changed.', self._device_board, cached_board)
            return False
        if (self._device_model and cached_model
                    and (self._device_model != cached_model)):
            logging.info(
                    'The model: %s from host_info does not match model: %s'
                    ' from cached profile, the device hardware probably has'
                    ' been changed.', self._device_model, cached_model)
            return False
        return True

    def _is_validate_profile_key(self, key):
        """Check whether key is one of the keys of the profile template."""
        return key in DEVICE_HEALTH_PROFILE_TEMPLATE

    def _update_profile(self, key, value):
        """The basic interface to set a value in health profile dictionary.

        An invalid key is only logged, the profile is left unchanged.

        @param key: The health profile key to update.
        @param value: The new value to store for the key.
        """
        if not self._is_validate_profile_key(key):
            logging.info('%s is an invalid health profile key.', key)
            return
        logging.debug('Updating health profile key %s to %s', key, value)
        self._health_profile[key] = value

    def _get_value(self, key):
        """The basic interface to get a value from health profile dictionary.

        @param key: The health profile key to read.

        @returns the value stored for the key, or None if the key is not
                 present in the loaded profile.

        @raises InvalidDeviceHealthProfileKeyError if the input key is
                not a valid device health profile key.
        """
        if not self._is_validate_profile_key(key):
            raise InvalidDeviceHealthProfileKeyError(
                    '%s is not a valid device health profile key' % key)
        return self._health_profile.get(key)

    @staticmethod
    def _current_timestamp():
        """Get the current local time formatted with TIME_PATTERN."""
        return time.strftime(TIME_PATTERN, time.localtime())

    @staticmethod
    def _timestamp_to_epoch(timestamp):
        """Convert a timestamp string into unix time.

        The timestamp is interpreted as local time, which matches how
        _current_timestamp() produces it.

        @param timestamp: A string formatted with TIME_PATTERN.

        @returns the unix time as an int.
        """
        return int(time.mktime(time.strptime(timestamp, TIME_PATTERN)))

    def _increase_tag_count(self, key, tag):
        """Increase by 1 the count kept for tag in the dict stored at key.

        @param key: The health profile key whose value is a dict of counts.
        @param tag: The short identifier to count; starts from 0 if the
                    tag was not recorded yet.
        """
        counters = self._get_value(key)
        counters[tag] = counters.get(tag, 0) + 1
        self._update_profile(key, counters)

    def _set_board(self, board):
        """Set device board in the health profile."""
        self._update_profile(BOARD_KEY, board)

    def _set_model(self, model):
        """Set device model in the health profile."""
        self._update_profile(MODEL_KEY, model)

    @property
    def health_profile(self):
        """The health profile data, None if the profile is not loaded."""
        return self._health_profile

    def get_board(self):
        """Get device board from cached device health profile.
        """
        return self._get_value(BOARD_KEY)

    def get_model(self):
        """Get device model from cached device health profile.
        """
        return self._get_value(MODEL_KEY)

    def get_profile_version(self):
        """Get the version of cached device health profile.
        """
        return self._get_value(PROFILE_VERSION_KEY)

    def get_dut_state(self):
        """Get most recent dut state from device health profile.
        """
        return self._get_value(DUT_STATE_KEY)

    def get_servo_state(self):
        """Get most recent servo state from device health profile.
        """
        return self._get_value(SERVO_STATE_KEY)

    def get_cros_stable_version(self):
        """Get the most recent used cros image during repair.
        """
        return self._get_value(CROS_STABLE_VERSION_KEY)

    def get_firmware_stable_version(self):
        """Get the most recent used firmware image during repair, we only
        expect to see this on non-faft pool device.
        """
        return self._get_value(FIRMWARE_STABLE_VERSION_KEY)

    def get_last_update_time(self):
        """Get the timestamp of when device health profile file received
        the most recent updates. Example "2020-01-01 15:05:05"
        """
        return self._get_value(LAST_UPDATE_TIME_KEY)

    def get_last_update_time_epoch(self):
        """Get the unix time in int of when device health profile file
        received the most recent updates.
        """
        return self._timestamp_to_epoch(self.get_last_update_time())

    def get_enter_current_state_time(self):
        """Get the timestamp of when DUT enter current state.
        Example "2020-01-01 15:05:05"
        """
        return self._get_value(TIME_ENTER_CURRENT_STATE_KEY)

    def get_enter_current_state_time_epoch(self):
        """Get the unix time in int of when DUT enter current state.
        """
        return self._timestamp_to_epoch(self.get_enter_current_state_time())

    def get_repair_fail_count(self):
        """Get repair fail count since enter current state.
        """
        return self._get_value(REPAIR_FAIL_COUNT_KEY)

    def get_provision_fail_count(self):
        """Get provision fail count since enter current state.
        """
        return self._get_value(PROVISION_FAIL_COUNT_KEY)

    def get_failed_verifiers(self):
        """Get all failed verifiers.

        @returns a dict represents all failed verifiers and
                 their fail count.
        """
        return self._get_value(FAILED_VERIFIERS_KEY)

    def get_failed_verifier(self, tag):
        """Get fail count of a specific verifier.

        @param tag: the short identifier of the verifier.

        @returns the fail count of the specified verifier.
        """
        return self.get_failed_verifiers().get(tag, 0)

    def get_succeed_repair_actions(self):
        """Get all repair actions that has been applied and succeed.

        @returns a dict represents all succeed repair actions
                 and their success count.
        """
        return self._get_value(SUCCEED_REPAIR_ACTIONS_KEY)

    def get_succeed_repair_action(self, tag):
        """Get success count of a specific repair action.

        @param tag: the short identifier of the repair action.

        @returns the success count of the specified repair action.
        """
        return self.get_succeed_repair_actions().get(tag, 0)

    def get_failed_repair_actions(self):
        """Get all repair actions that has been applied and failed.

        @returns a dict represents all failed repair actions
                 and their fail count.
        """
        return self._get_value(FAILED_REPAIR_ACTIONS_KEY)

    def get_failed_repair_action(self, tag):
        """Get fail count of a specific repair action.

        @param tag: the short identifier of the repair action.

        @returns the failed count of the specified repair action.
        """
        return self.get_failed_repair_actions().get(tag, 0)

    def get_badblocks_ro_run_time(self):
        """Get the timestamp of when run last read-only badblocks check
        on the device. Example "2020-01-01 15:05:05"

        @returns the recorded timestamp, or DEFAULT_TIMESTAMP if no
                 timestamp is recorded.
        """
        last_time = self._get_value(LAST_BADBLOCKS_RO_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_badblocks_ro_run_time_epoch(self):
        """Get the unix time of when run last read-only badblocks check
        on the device.
        """
        return self._timestamp_to_epoch(self.get_badblocks_ro_run_time())

    def get_badblocks_rw_run_time(self):
        """Get the timestamp of when run last read-write badblocks check
        on the device. Example "2020-01-01 15:05:05"

        @returns the recorded timestamp, or DEFAULT_TIMESTAMP if no
                 timestamp is recorded.
        """
        last_time = self._get_value(LAST_BADBLOCKS_RW_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_badblocks_rw_run_time_epoch(self):
        """Get the unix time of when run last read-write badblocks check
        on the device.
        """
        return self._timestamp_to_epoch(self.get_badblocks_rw_run_time())

    def get_servo_micro_fw_update_time(self):
        """Get the timestamp of when run last fw update for servo_micro.
        Example "2020-01-01 15:05:05"

        @returns the recorded timestamp, or DEFAULT_TIMESTAMP if no
                 timestamp is recorded.
        """
        last_time = self._get_value(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY)
        return last_time or DEFAULT_TIMESTAMP

    def get_servo_micro_fw_update_time_epoch(self):
        """Get the unix time of when run last fw update for servo_micro.
        """
        return self._timestamp_to_epoch(self.get_servo_micro_fw_update_time())

    def set_cros_stable_version(self, build):
        """Set the most recent used cros image during repair.
        """
        self._update_profile(CROS_STABLE_VERSION_KEY, build)

    def set_firmware_stable_version(self, build):
        """Set the most recent used firmware image during repair, we only
        expect to see this on non-faft pool device.
        """
        self._update_profile(FIRMWARE_STABLE_VERSION_KEY, build)

    def refresh_badblocks_ro_run_time(self):
        """Set the timestamp of the last read-only badblocks check on the
        device to the current local time.
        """
        self._update_profile(LAST_BADBLOCKS_RO_RUN_TIME_KEY,
                             self._current_timestamp())

    def refresh_badblocks_rw_run_time(self):
        """Set the timestamp of the last read-write badblocks check on the
        device to the current local time.
        """
        self._update_profile(LAST_BADBLOCKS_RW_RUN_TIME_KEY,
                             self._current_timestamp())

    def refresh_servo_micro_fw_update_run_time(self):
        """Set the timestamp of the last fw update for servo_micro to the
        current local time.
        """
        self._update_profile(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY,
                             self._current_timestamp())

    def refresh_servo_miro_fw_update_run_time(self):
        """Misspelled name of refresh_servo_micro_fw_update_run_time, kept
        so that existing callers continue to work.
        """
        self.refresh_servo_micro_fw_update_run_time()

    def refresh_update_time(self):
        """Update last_update_time to the current timestamp.

        The timestamp is in local time, not UTC; the *_epoch getters
        decode timestamps as local time as well.
        """
        self._update_profile(LAST_UPDATE_TIME_KEY, self._current_timestamp())

    def increase_repair_fail_count(self):
        """Increase repair fail count by 1."""
        self._update_profile(REPAIR_FAIL_COUNT_KEY,
                             self.get_repair_fail_count() + 1)

    def increase_provision_fail_count(self):
        """Increase provision fail count by 1."""
        self._update_profile(PROVISION_FAIL_COUNT_KEY,
                             self.get_provision_fail_count() + 1)

    def insert_failed_verifier(self, tag):
        """Increase fail count for a specific verifier by 1.

        @param tag: the short identifier of the verifier.
        """
        self._increase_tag_count(FAILED_VERIFIERS_KEY, tag)

    def insert_succeed_repair_action(self, tag):
        """Increase succeed count for a specific repair action by 1.

        @param tag: the short identifier of the repair action.
        """
        self._increase_tag_count(SUCCEED_REPAIR_ACTIONS_KEY, tag)

    def insert_failed_repair_action(self, tag):
        """Increase fail count for a specific repair action by 1.

        @param tag: the short identifier of the repair action.
        """
        self._increase_tag_count(FAILED_REPAIR_ACTIONS_KEY, tag)

    def update_dut_state(self, state, reset_counters=False):
        """Update state of the device.

        If the device is already in the given state nothing is updated,
        except that the repair fail count is increased by 1 when that state
        is DUT_STATE_REPAIR_FAILED. Otherwise the new state and the time of
        entering it are recorded.

        @param state: the new dut state to update.
        @param reset_counters: a boolean to indicate whether we want to
                               reset all counters when the state changes.
        """
        if state == self.get_dut_state():
            logging.debug('The host is already in %s state.', state)
            if state == DUT_STATE_REPAIR_FAILED:
                self.increase_repair_fail_count()
            return
        # Reset some records when dut state changes.
        if reset_counters:
            self._update_profile(REPAIR_FAIL_COUNT_KEY, 0)
            self._update_profile(PROVISION_FAIL_COUNT_KEY, 0)
            self._update_profile(FAILED_VERIFIERS_KEY, {})
            self._update_profile(SUCCEED_REPAIR_ACTIONS_KEY, {})
            self._update_profile(FAILED_REPAIR_ACTIONS_KEY, {})
        self._update_profile(TIME_ENTER_CURRENT_STATE_KEY,
                             self._current_timestamp())
        self._update_profile(DUT_STATE_KEY, state)

    def update_servo_state(self, state):
        """Update state of the servo, no-op if the state is unchanged.

        @param state: the new servo state to update.
        """
        if state == self.get_servo_state():
            logging.debug('The servo is already in %s state.', state)
            return
        self._update_profile(SERVO_STATE_KEY, state)

    def close(self):
        """Refresh the update time, dump the profile into the local file
        and upload that file to the profile host.
        """
        self.refresh_update_time()
        self._dump_profile()
        self._upload_profile()