• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python2
2# Copyright 2020 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import os
7import copy
8import json
9import time
10import logging
11
12from autotest_lib.server.cros.device_health_profile.profile_constants import *
13
14
class DeviceHealthProfileError(Exception):
    """Base error for failures raised by a DeviceHealthProfile object."""
19
20
class InvalidDeviceHealthProfileKeyError(DeviceHealthProfileError):
    """Raised when a value is requested for an invalid health profile key."""
25
26
27class DeviceHealthProfile(object):
    """Provide an interface to access the device health profile that is
    cached on the profile host (usually a labstation).
    """
31
32    def __init__(self, hostname, host_info=None, result_dir=None):
33        """Initialize the class.
34
35        @param hostname:    The device hostaname or identification.
36        @param host_info:   A HostInfo object of the device of the profile.
37        @param result_dir:  A result directory where we can keep local copy of
38                            device profile.
39        """
40        self._hostname = hostname
41        # Cache host-info data
42        self._device_board = host_info.board if host_info else ''
43        self._device_model = host_info.model if host_info else ''
44        # the profile is located on servo-host as temporally location.
45        # The servo-host will be provided later
46        self._profile_host = None
47        self._health_profile = None
48
49        # Construct remote and local file path.
50        profile_filename = self._hostname + '.profile'
51        self._remote_path = os.path.join(PROFILE_FILE_DIR, profile_filename)
52        result_dir = result_dir or '/tmp'
53        self._local_path = os.path.join(result_dir, profile_filename)
54
55    def init_profile(self, profile_host):
56        """Initialize device health profile data.
57
58        If the cached file exists on profile host the method will download
59        file to a local path and read data, otherwise create a profile data
60        from template.
61
62        @param profile_host: An ServoHost object, where is the location
63                             we store device health for device.
64        """
65        if not profile_host:
66            raise DeviceHealthProfileError('The profile host is not provided.')
67        self._profile_host = profile_host
68        # Do a lightweighted check to make sure the machine is up
69        # (by ping), as we don't waste time on unreachable DUT.
70        if not self._profile_host.check_cached_up_status():
71            raise DeviceHealthProfileError(
72                'The profile host %s is not reachable via ping.'
73                % self._profile_host.hostname)
74
75        # We also want try to check if the DUT is available for ssh.
76        if not self._profile_host.is_up():
77            raise DeviceHealthProfileError(
78                'The profile host %s is pingable but not sshable.'
79                % self._profile_host.hostname)
80
81        if not self._sync_existing_profile():
82            self._create_profile_from_template()
83
84    def is_loaded(self):
85        """Check if device profile was loaded on not."""
86        return self._health_profile is not None
87
88    def _sync_existing_profile(self):
89        """Sync health profile from remote profile host(servohost) and
90        validate profile data is not corrupted or outdated.
91
92        @returns True if sync and validate succeed otherwise False.
93        """
94        if not self._profile_host.is_file_exists(self._remote_path):
95            logging.debug('%s not exists on %s.', self._remote_path,
96                          self._profile_host.hostname)
97            return False
98        self._download_profile()
99        self._read_profile()
100        return self._validate_profile_data(self._health_profile)
101
102    def _download_profile(self):
103        """Copy profile file from remote profile host to local path.
104        """
105        logging.debug('Downloading profile file from %s:%s to local path: %s',
106                      self._profile_host.hostname,
107                      self._remote_path,
108                      self._local_path)
109        self._profile_host.get_file(source=self._remote_path,
110                                    dest=self._local_path)
111
112    def _upload_profile(self):
113        """Copy profile file from local path to remote profile host.
114        """
115        # Make sure the device health profile directory exists on profile host.
116        self._profile_host.run('mkdir -p %s' % PROFILE_FILE_DIR,
117                               ignore_status=True)
118
119        logging.debug('Uploading profile from local path: %s to remote %s:%s',
120                      self._local_path,
121                      self._profile_host.hostname,
122                      self._remote_path)
123        self._profile_host.send_file(source=self._local_path,
124                                     dest=self._remote_path)
125
126    def _read_profile(self):
127        """Read profile data from local path and convert it into json format.
128        """
129        logging.debug('Reading device health profile from: %s',
130                      self._local_path)
131        with open(self._local_path, 'r') as f:
132            try:
133                self._health_profile = json.load(f)
134            except Exception as e:
135                logging.warning('Could not decode %s to json format, the file'
136                                ' may be corrupted; %s',
137                                self._local_path, str(e))
138
139    def _dump_profile(self):
140        """Dump profile data into local file.
141        """
142        logging.debug('Dumping device health profile to: %s', self._local_path)
143        with open(self._local_path, 'w') as f:
144            json.dump(self._health_profile, f)
145
146    def _create_profile_from_template(self):
147        """Create a new health profile dict from template.
148        """
149        logging.info('Creating new health profile from template for %s.',
150                     self._hostname)
151        self._health_profile = copy.deepcopy(DEVICE_HEALTH_PROFILE_TEMPLATE)
152        if self._device_board or self._device_model:
153            self._set_board(self._device_board)
154            self._set_model(self._device_model)
155        self.refresh_update_time()
156
157    def _validate_profile_data(self, data):
158        """Validate the given profile data is in good state.
159        """
160        logging.debug('Validating health profile data.')
161        if not isinstance(data, dict):
162            logging.debug('Non-dict type detected, the profile data'
163                          ' may be corrupted.')
164            return False
165
166        # Validate that cached health profile version is not outdated.
167        input_version = data.get(PROFILE_VERSION_KEY)
168        if input_version != PROFILE_VERSION:
169            logging.info('The input profile version: %s is outdated,'
170                         ' expected version: %s', input_version,
171                         PROFILE_VERSION)
172            return False
173
174        # Validate that cached board/model is match with device, in case
175        # there is was decom/redeploy.
176        cached_board = data.get(BOARD_KEY)
177        cached_model = data.get(MODEL_KEY)
178        if (self._device_board and cached_board
179                    and (self._device_board != cached_board)):
180            logging.info(
181                    'The board: %s from host_info does not match board: %s'
182                    ' from cached profile, the device hardware probably has'
183                    ' been changed.', self._device_board, cached_board)
184            return False
185        if (self._device_model and cached_model
186                    and (self._device_model != cached_model)):
187            logging.info(
188                    'The model: %s from host_info does not match model: %s'
189                    ' from cached profile, the device hardware probably has'
190                    ' been changed.', self._device_model, cached_model)
191            return False
192        return True
193
194    def _is_validate_profile_key(self, key):
195        return key in DEVICE_HEALTH_PROFILE_TEMPLATE
196
197    def _update_profile(self, key, value):
198        if not self._is_validate_profile_key(key):
199            logging.info('%s is an invalid health profile key.', key)
200            return
201        logging.debug('Updating health profile key %s to %s', key, value)
202        self._health_profile[key] = value
203
204    def _get_value(self, key):
205        """The basic interface to get a value from health profile dictionary.
206
207        @raises InvalidDeviceHealthProfileKeyError if the input key is
208                not a valid device health profile key.
209        """
210        if not self._is_validate_profile_key(key):
211            raise InvalidDeviceHealthProfileKeyError(
212                '%s is not a valid device health profile key' % key)
213        return self._health_profile.get(key)
214
215    def _set_board(self, board):
216        # pylint: disable=missing-docstring
217        self._update_profile(BOARD_KEY, board)
218
219    def _set_model(self, model):
220        # pylint: disable=missing-docstring
221        self._update_profile(MODEL_KEY, model)
222
223    @property
224    def health_profile(self):
225        # pylint: disable=missing-docstring
226        return self._health_profile
227
228    def get_board(self):
229        """Get device board from cached device health profile.
230        """
231        return self._get_value(BOARD_KEY)
232
233    def get_model(self):
234        """Get device model from cached device health profile.
235        """
236        return self._get_value(MODEL_KEY)
237
238    def get_profile_version(self):
239        """Get the version of cached device health profile.
240        """
241        return self._get_value(PROFILE_VERSION_KEY)
242
243    def get_dut_state(self):
244        """Get most recent dut state from device health profile.
245        """
246        return self._get_value(DUT_STATE_KEY)
247
248    def get_servo_state(self):
249        """Get most recent servo state from device health profile.
250        """
251        return self._get_value(SERVO_STATE_KEY)
252
253    def get_cros_stable_version(self):
254        """Get the most recent used cros image during repair.
255        """
256        return self._get_value(CROS_STABLE_VERSION_KEY)
257
258    def get_firmware_stable_version(self):
259        """Get the most recent used firmware image during repair, we only
260        expect to see this on non-faft pool device.
261        """
262        return self._get_value(FIRMWARE_STABLE_VERSION_KEY)
263
264    def get_last_update_time(self):
265        """Get the timestamp of when device health profile file received
266        the most recent updates. Example "2020-01-01 15:05:05"
267        """
268        return self._get_value(LAST_UPDATE_TIME_KEY)
269
270    def get_last_update_time_epoch(self):
271        """Get the unix time in int of when device health profile file
272        received the most recent updates.
273        """
274        return int(time.mktime(time.strptime(
275            self.get_last_update_time(), TIME_PATTERN)))
276
277    def get_enter_current_state_time(self):
278        """Get the timestamp of when DUT enter current state.
279        Example "2020-01-01 15:05:05"
280        """
281        return self._get_value(TIME_ENTER_CURRENT_STATE_KEY)
282
283    def get_enter_current_state_time_epoch(self):
284        """Get the unix time in int of when DUT enter current state.
285        """
286        return int(time.mktime(time.strptime(
287            self.get_enter_current_state_time(), TIME_PATTERN)))
288
289    def get_repair_fail_count(self):
290        """Get repair fail count since enter current state.
291        """
292        return self._get_value(REPAIR_FAIL_COUNT_KEY)
293
294    def get_provision_fail_count(self):
295        """Get provision fail count since enter current state.
296        """
297        return self._get_value(PROVISION_FAIL_COUNT_KEY)
298
299    def get_failed_verifiers(self):
300        """Get all failed verifiers.
301
302        @returns a dict represents all failed verifiers and
303                 their fail count.
304        """
305        return self._get_value(FAILED_VERIFIERS_KEY)
306
307    def get_failed_verifier(self, tag):
308        """Get fail count of a specific verifier.
309
310        @param tag: the short identifier of the verifier.
311
312        @returns the fail count of the specified verifier.
313        """
314        return self.get_failed_verifiers().get(tag, 0)
315
316    def get_succeed_repair_actions(self):
317        """Get all repair actions that has been applied and succeed.
318
319        @returns a dict represents all succeed repair actions
320                 and their success count.
321        """
322        return self._get_value(SUCCEED_REPAIR_ACTIONS_KEY)
323
324    def get_succeed_repair_action(self, tag):
325        """Get success count of a specific repair action.
326
327        @param tag: the short identifier of the repair action.
328
329        @returns the success count of the specified repair action.
330        """
331        return self.get_succeed_repair_actions().get(tag, 0)
332
333    def get_failed_repair_actions(self):
334        """Get all repair actions that has been applied and failed.
335
336        @returns a dict represents all failed repair actions
337                 and their fail count.
338        """
339        return self._get_value(FAILED_REPAIR_ACTIONS_KEY)
340
341    def get_failed_repair_action(self, tag):
342        """Get fail count of a specific repair action.
343
344        @param tag: the short identifier of the repair action.
345
346        @returns the failed count of the specified repair action.
347        """
348        return self.get_failed_repair_actions().get(tag, 0)
349
350    def get_badblocks_ro_run_time(self):
351        """Get the timestamp of when run last read-only badblocks check
352        on the device. Example "2020-01-01 15:05:05"
353        """
354        last_time = self._get_value(LAST_BADBLOCKS_RO_RUN_TIME_KEY)
355        return last_time or DEFAULT_TIMESTAMP
356
357    def get_badblocks_ro_run_time_epoch(self):
358        """Get the unix time of when run last read-only badblocks check
359        on the device."
360        """
361        last_time = self.get_badblocks_ro_run_time()
362        return int(time.mktime(time.strptime(last_time, TIME_PATTERN)))
363
364    def get_badblocks_rw_run_time(self):
365        """Get the timestamp of when run last read-write badblocks check
366        on the device. Example "2020-01-01 15:05:05"
367        """
368        last_time = self._get_value(LAST_BADBLOCKS_RW_RUN_TIME_KEY)
369        return last_time or DEFAULT_TIMESTAMP
370
371    def get_badblocks_rw_run_time_epoch(self):
372        """Get the unix time of when run last read-write badblocks check
373        on the device."
374        """
375        last_time = self.get_badblocks_rw_run_time()
376        return int(time.mktime(time.strptime(last_time, TIME_PATTERN)))
377
378    def get_servo_micro_fw_update_time(self):
379        """Get the timestamp of when run last fw update for servo_micro.
380        Example "2020-01-01 15:05:05"
381        """
382        last_time = self._get_value(LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY)
383        return last_time or DEFAULT_TIMESTAMP
384
385    def get_servo_micro_fw_update_time_epoch(self):
386        """Get the unix time of when run last fw update for servo_micro.
387        """
388        last_time = self.get_servo_micro_fw_update_time()
389        return int(time.mktime(time.strptime(last_time, TIME_PATTERN)))
390
391    def set_cros_stable_version(self, build):
392        """Set the most recent used cros image during repair.
393        """
394        self._update_profile(CROS_STABLE_VERSION_KEY, build)
395
396    def set_firmware_stable_version(self, build):
397        """Set the most recent used firmware image during repair, we only
398        expect to see this on non-faft pool device.
399        """
400        self._update_profile(FIRMWARE_STABLE_VERSION_KEY, build)
401
402    def refresh_badblocks_ro_run_time(self):
403        """Get the timestamp of when run last read-only badblocks check
404        on the device.
405        """
406        return self._update_profile(
407                LAST_BADBLOCKS_RO_RUN_TIME_KEY,
408                time.strftime(TIME_PATTERN, time.localtime()))
409
410    def refresh_badblocks_rw_run_time(self):
411        """Get the timestamp of when run last read-write badblocks check
412        on the device.
413        """
414        return self._update_profile(
415                LAST_BADBLOCKS_RW_RUN_TIME_KEY,
416                time.strftime(TIME_PATTERN, time.localtime()))
417
418    def refresh_servo_miro_fw_update_run_time(self):
419        """Get the timestamp of when run last fw update for servo_micro.
420        """
421        return self._update_profile(
422                LAST_SERVO_MICRO_FW_UPDATE_RUN_TIME_KEY,
423                time.strftime(TIME_PATTERN, time.localtime()))
424
425    def refresh_update_time(self):
426        """Update last_update_time to current timestamp in UTC.
427        """
428        self._update_profile(LAST_UPDATE_TIME_KEY,
429                             time.strftime(TIME_PATTERN, time.localtime()))
430
431    def increase_repair_fail_count(self):
432        # pylint: disable=missing-docstring
433        self._update_profile(REPAIR_FAIL_COUNT_KEY,
434                             self.get_repair_fail_count() + 1)
435
436    def increase_provision_fail_count(self):
437        # pylint: disable=missing-docstring
438        self._update_profile(PROVISION_FAIL_COUNT_KEY,
439                             self.get_provision_fail_count() + 1)
440
441    def insert_failed_verifier(self, tag):
442        """Increase fail count for a specific verifier by 1.
443        """
444        verifiers = self.get_failed_verifiers()
445        if tag not in verifiers:
446            verifiers[tag] = 0
447        verifiers[tag] += 1
448        self._update_profile(FAILED_VERIFIERS_KEY, verifiers)
449
450    def insert_succeed_repair_action(self, tag):
451        """Increase succeed count for a specific repair action by 1.
452        """
453        actions = self.get_succeed_repair_actions()
454        if tag not in actions:
455            actions[tag] = 0
456        actions[tag] += 1
457        self._update_profile(SUCCEED_REPAIR_ACTIONS_KEY, actions)
458
459    def insert_failed_repair_action(self, tag):
460        """Increase fail count for a specific repair action by 1.
461        """
462        actions = self.get_failed_repair_actions()
463        if tag not in actions:
464            actions[tag] = 0
465        actions[tag] += 1
466        self._update_profile(FAILED_REPAIR_ACTIONS_KEY, actions)
467
468    def update_dut_state(self, state, reset_counters=False):
469        """Update state of the device, this will also reset all fail counts.
470
471        @param state: the new dut state to update.
472        @param reset_counts: a boolean to indicate whether we want to reset
473                             all counters.
474        """
475        if state == self.get_dut_state():
476            logging.debug('The host is already in %s state.', state)
477            if state == DUT_STATE_REPAIR_FAILED:
478                self.increase_repair_fail_count()
479            return
480        # Reset some records when dut state changes.
481        if reset_counters:
482            self._update_profile(REPAIR_FAIL_COUNT_KEY, 0)
483            self._update_profile(PROVISION_FAIL_COUNT_KEY, 0)
484            self._update_profile(FAILED_VERIFIERS_KEY, {})
485            self._update_profile(SUCCEED_REPAIR_ACTIONS_KEY, {})
486            self._update_profile(FAILED_REPAIR_ACTIONS_KEY, {})
487        self._update_profile(TIME_ENTER_CURRENT_STATE_KEY,
488                             time.strftime(TIME_PATTERN, time.localtime()))
489        self._update_profile(DUT_STATE_KEY, state)
490
491    def update_servo_state(self, state):
492        # pylint: disable=missing-docstring
493        if state == self.get_servo_state():
494            logging.debug('The servo is already in %s state.', state)
495            return
496        self._update_profile(SERVO_STATE_KEY, state)
497
498    def close(self):
499        # pylint: disable=missing-docstring
500        self.refresh_update_time()
501        self._dump_profile()
502        self._upload_profile()
503