• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python2
2# Copyright 2020 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import logging
7import time
8import re
9
10from autotest_lib.client.common_lib import error
11
# Storage types supported by StorageStateValidator.get_type().
STORAGE_TYPE_SSD = 'ssd'
STORAGE_TYPE_NVME = 'nvme'
STORAGE_TYPE_MMC = 'mmc'

# Storage states reported by StorageStateValidator.get_state():
# normal - drive in a good shape; warning - drive close to worn out;
# critical - drive is worn out and has errors.
STORAGE_STATE_NORMAL = 'normal'
STORAGE_STATE_WARNING = 'warning'
STORAGE_STATE_CRITICAL = 'critical'

# Keys accepted by get_state(run_badblocks=...) to select the read-only
# or the non-destructive read-write badblocks verification.
BADBLOCK_CHECK_RO = 'RO'
BADBLOCK_CHECK_RW = 'RW'
24
25
class StorageError(error.TestFail):
    """Raised when storage info is unsupported, unavailable or cannot
    be detected on the DUT.
    """
    pass
31
32
class ConsoleError(error.TestFail):
    """Common error class for servod console-backed control failures."""
    pass
36
37
class StorageStateValidator(object):
    """Class to detect the type and the state of the DUT storage.

    The class supports SSD, NVME and MMC storage types.
    The state detection reports one of:
    - normal - drive in a good shape
    - warning - drive close to the worn out state by any metrics
    - critical - drive is worn out and has errors
    """

    def __init__(self, host):
        """Initialize the storage validator.

        @param host: cros_host object providing console access
                     for reading the target info.

        @raises ConsoleError: if cannot read info
        @raises StorageError: if info is not present
        """
        self._host = host
        self._storage_type = None
        self._storage_state = None
        # Raw storage info lines as collected from the DUT.
        self._info = []

        if not self._host:
            raise StorageError('Host is not provided')

        self._read_storage_info()

    def _read_storage_info(self):
        """Read the storage info from SMART.

        The info is kept as a collection of lines in self._info.

        @raises StorageError: if no info provided or data unavailable
        """
        logging.info('Extraction storage info')
        command = '. /usr/share/misc/storage-info-common.sh; get_storage_info'
        cmd_result = self._host.run(command, ignore_status=True)
        if cmd_result.exit_status != 0:
            # Bug fix: stderr used to be passed as a second exception
            # argument ('%s' was never interpolated into the message).
            raise StorageError('receive error: %s;' % cmd_result.stderr)

        if cmd_result.stdout:
            self._info = cmd_result.stdout.splitlines()
        if not self._info:
            raise StorageError('Storage info is empty')

    def get_type(self):
        """Determine the type of the storage on the host.

        The detected value is cached after the first call.

        @returns storage type (ssd, nvme, mmc)

        @raises StorageError: if type not supported or not determined
        """
        if not self._storage_type:
            self._storage_type = self._get_storage_type()
        return self._storage_type

    def get_state(self, run_badblocks=None):
        """Determine the state of the storage on the host.

        @param run_badblocks: string key to run badblock check.
                                None - check if we can run it
                                "NOT" - do not run check
                                "RW" - run read-write if booted from USB
                                "RO"  - run read-only check
        @returns storage state (normal|warning|critical)

        @raises StorageError: if type not supported or state cannot
                            be determined
        """
        if not self._storage_state:
            storage_type = self.get_type()
            if storage_type == STORAGE_TYPE_SSD:
                self._storage_state = self._get_state_for_ssd()
            elif storage_type == STORAGE_TYPE_MMC:
                self._storage_state = self._get_state_for_mms()
            elif storage_type == STORAGE_TYPE_NVME:
                self._storage_state = self._get_state_for_nvme()
        if (run_badblocks != 'NOT'
                    and self._storage_state != STORAGE_STATE_CRITICAL
                    and self._support_health_profile()):
            # Run badblocks only when the storage is not in critical state;
            # if a bad block is found the storage is marked as bad.
            logging.info('Trying run badblocks on device')
            dhp = self._host.health_profile
            usb_boot = self._host.is_boot_from_external_device()
            if run_badblocks is None:
                if _is_time_to_run_badblocks_ro(dhp):
                    run_badblocks = BADBLOCK_CHECK_RO
                # Blocked for now till we confirm that SMART stats would not
                # detect the issue before we do.
                # if usb_boot and _is_time_to_run_badblocks_rw(dhp):
                #     run_badblocks = BADBLOCK_CHECK_RW
            logging.debug('run_badblocks=%s', run_badblocks)
            if usb_boot and run_badblocks == BADBLOCK_CHECK_RW:
                self._run_read_write_badblocks_check()
                dhp.refresh_badblocks_rw_run_time()
                # RO is a subclass of RW so update its timestamp too.
                dhp.refresh_badblocks_ro_run_time()
            if run_badblocks == BADBLOCK_CHECK_RO:
                # SMART stats sometimes does not report an issue when blocks
                # are bad for reading, so we run the RO check as well.
                self._run_readonly_badblocks_check()
                dhp.refresh_badblocks_ro_run_time()
        return self._storage_state

    def _get_storage_type(self):
        """Read the info to detect the type of the storage by patterns.

        @returns one of STORAGE_TYPE_SSD, STORAGE_TYPE_MMC, STORAGE_TYPE_NVME

        @raises StorageError: if no known pattern matched
        """
        logging.info('Extraction storage type')
        # Example "SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s)"
        sata_detect = r"SATA Version is:.*"

        # Example "   Extended CSD rev 1.7 (MMC 5.0)"
        mmc_detect = r"\s*Extended CSD rev.*MMC (?P<version>\d+.\d+)"

        # Example "SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)"
        nvme_detect = r".*NVMe Log .*"

        for line in self._info:
            if re.match(sata_detect, line):
                logging.info('Found SATA device')
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_SSD

            m = re.match(mmc_detect, line)
            if m:
                version = m.group('version')
                logging.info('Found eMMC device, version: %s', version)
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_MMC

            if re.match(nvme_detect, line):
                logging.info('Found NVMe device')
                logging.debug('Found line => %s', line)
                return STORAGE_TYPE_NVME
        raise StorageError('Storage type cannot be detect')

    def _get_state_for_ssd(self):
        """Read the info to detect the state for SSD storage.

        @returns storage state (normal|warning|critical)
        """
        logging.info('Extraction metrics for SSD storage')
        # Field meaning and example line that have failing attribute
        # https://en.wikipedia.org/wiki/S.M.A.R.T.
        # ID# ATTRIBUTE_NAME     FLAGS    VALUE WORST THRESH FAIL RAW_VALUE
        # 184 End-to-End_Error   PO--CK   001   001   097    NOW  135
        ssd_fail = r"""\s*(?P<param>\S+\s\S+)      # ID and attribute name
                    \s+[P-][O-][S-][R-][C-][K-] # flags
                    (\s+\d{3}){3}               # three 3-digits numbers
                    \s+NOW                      # fail indicator"""

        ssd_relocate_sectors = r"""\s*\d\sReallocated_Sector_Ct
                    \s*[P-][O-][S-][R-][C-][K-] # flags
                    \s*(?P<value>\d{3}) # VALUE
                    \s*(?P<worst>\d{3}) # WORST
                    \s*(?P<thresh>\d{3})# THRESH
                    """
        # future optimizations: read GPL and determine percentage
        for line in self._info:
            # Bug fix: the patterns above are written in verbose form;
            # without re.VERBOSE their embedded whitespace and '#' comments
            # were matched literally and the patterns could never match.
            if re.match(ssd_fail, line, re.VERBOSE):
                logging.debug('Found fail line => %s', line)
                return STORAGE_STATE_CRITICAL

            m = re.match(ssd_relocate_sectors, line, re.VERBOSE)
            if m:
                logging.info('Found critical line => %s', line)
                value = int(m.group('value'))
                # Manufacturers set the default value to 100; when the
                # number starts to grow it is time to mark the drive.
                if value > 100:
                    return STORAGE_STATE_WARNING
        return STORAGE_STATE_NORMAL

    def _get_state_for_mms(self):
        """Read the info to detect the state for MMC storage.

        @returns storage state (normal|warning|critical)
        """
        logging.debug('Extraction metrics for MMC storage')
        # Ex:
        # Device life time type A [DEVICE_LIFE_TIME_EST_TYP_A: 0x01]
        # 0x00~9 means 0-90% band
        # 0x0a means 90-100% band
        # 0x0b means over 100% band
        # Bug fix: verbose pattern requires re.VERBOSE and an explicit \s+
        # to match the space before the value (literal whitespace in the
        # pattern is ignored in verbose mode).
        mmc_fail_lev = r""".*(?P<param>DEVICE_LIFE_TIME_EST_TYP_.)]?:
                        \s+0x0(?P<val>\S)"""  # life time percentage

        # Ex "Pre EOL information [PRE_EOL_INFO: 0x01]"
        # 0x00 - not defined
        # 0x01 - Normal
        # 0x02 - Warning, consumed 80% of the reserved blocks
        # 0x03 - Urgent, consumed 90% of the reserved blocks
        mmc_fail_eol = r".*(?P<param>PRE_EOL_INFO.)]?: 0x0(?P<val>\d)"

        eol_value = 0
        lev_value = -1
        for line in self._info:
            m = re.match(mmc_fail_lev, line, re.VERBOSE)
            if m:
                param = m.group('val')
                logging.debug('Found line for lifetime estimate => %s', line)
                if 'a' == param:
                    val = 100
                elif 'b' == param:
                    val = 101
                else:
                    val = int(param) * 10
                # Keep the worst (highest) estimate across TYP_A/TYP_B.
                if val > lev_value:
                    lev_value = val
                continue

            m = re.match(mmc_fail_eol, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for end-of-life => %s', line)
                eol_value = int(param)
                break

        # set state based on end-of-life
        if eol_value == 3:
            return STORAGE_STATE_CRITICAL
        elif eol_value == 2:
            return STORAGE_STATE_WARNING
        elif eol_value == 1:
            return STORAGE_STATE_NORMAL

        # set state based on life time estimates
        elif lev_value < 90:
            return STORAGE_STATE_NORMAL
        elif lev_value < 100:
            return STORAGE_STATE_WARNING
        return STORAGE_STATE_CRITICAL

    def _get_state_for_nvme(self):
        """Read the info to detect the state for NVMe storage.

        @returns storage state (normal|warning)
        """
        logging.debug('Extraction metrics for NVMe storage')
        # Ex "Percentage Used:         100%"
        nvme_fail = r"Percentage Used:\s+(?P<param>(\d{1,3}))%"
        used_value = -1
        for line in self._info:
            m = re.match(nvme_fail, line)
            if m:
                param = m.group('param')
                logging.debug('Found line for usage => %s', line)
                try:
                    used_value = int(param)
                except ValueError:
                    logging.info('Could not cast: %s to int ', param)
                break

        if used_value < 91:
            return STORAGE_STATE_NORMAL
        # Stop marking devices as bad when they reach 100% usage.
        # TODO(otabek) crbug.com/1140507 re-evaluate the max usage
        return STORAGE_STATE_WARNING

    def _get_device_storage_path(self):
        """Find and return the path to the device storage.

        Detection works even when the device booted from USB.

        @returns path to the main device like '/dev/XXXX', or None when
                 the path could not be detected.
        """
        # find the name of device storage
        cmd = ('. /usr/sbin/write_gpt.sh;'
               ' . /usr/share/misc/chromeos-common.sh;'
               ' load_base_vars; get_fixed_dst_drive')
        cmd_result = self._host.run(cmd,
                                    ignore_status=True,
                                    timeout=60)
        if cmd_result.exit_status != 0:
            logging.debug('Failed to detect path to the device storage')
            return None
        return cmd_result.stdout.strip()

    def _run_readonly_badblocks_check(self):
        """Run badblocks read-only verification on device storage.

        The block size is set to 512 bytes.
        Marks the storage critical when any bad block is reported.
        """
        path = self._get_device_storage_path()
        if not path:
            # cannot continue if storage was not detected
            return
        logging.info("Running readonly badblocks check; path=%s", path)
        cmd = 'badblocks -e 1 -s -b 512 %s' % path
        try:
            # set limit in 1 hour but expecting to finish it up 30 minutes
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=3600)
            if cmd_result.exit_status != 0:
                # Bug fix: used to log a copy-pasted storage-path message.
                logging.debug('badblocks run failed; exit_status=%s',
                              cmd_result.exit_status)
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # Any output means bad blocks were found; empty is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout during running action')
            logging.debug(str(e))

    def _run_read_write_badblocks_check(self):
        """Run non-destructive read-write check on device storage.

        The block size is set to 4096 bytes (the command uses '-b 4096';
        the old docstring incorrectly claimed 512).
        We can run this test only when DUT booted from USB.
        """
        path = self._get_device_storage_path()
        if not path:
            # cannot continue if storage was not detected
            return
        logging.info("Running read-write badblocks check; path=%s", path)
        cmd = 'badblocks -e 1 -nsv -b 4096 %s' % path
        try:
            # set limit in 90 minutes but expecting to finish it up 50 minutes
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=5400)
            if cmd_result.exit_status != 0:
                # Bug fix: used to log a copy-pasted storage-path message.
                logging.debug('badblocks run failed; exit_status=%s',
                              cmd_result.exit_status)
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # Any output means bad blocks were found; empty is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout during running action')
            logging.info('(Not critical) %s', e)

    def _support_health_profile(self):
        """Check whether the host exposes a device health profile.

        @returns truthy when self._host.health_profile is available.
        """
        return (hasattr(self._host, 'health_profile')
                and self._host.health_profile)
367
368
369def _is_time_to_run_badblocks_ro(dhp):
370    """Verify that device can proceed to run read-only badblocks check.
371    The RO check can be executed not often then one per 6 days.
372
373    @returns True if can proceed, False if not
374    """
375    today_time = int(time.time())
376    last_check = dhp.get_badblocks_ro_run_time_epoch()
377    can_run = today_time > (last_check + (6 * 24 * 60 * 60))
378    if not can_run:
379        logging.info(
380                'Run RO badblocks not allowed because we have run it recently,'
381                ' last run %s. RO check allowed to run only once per 6 days',
382                dhp.get_badblocks_ro_run_time())
383    return can_run
384
385
386def _is_time_to_run_badblocks_rw(dhp):
387    """Verify that device can proceed to run read-write badblocks check.
388    The RW check can be executed not often then one per 60 days.
389
390    @returns True if can proceed, False if not
391    """
392    today_time = int(time.time())
393    last_check = dhp.get_badblocks_rw_run_time_epoch()
394    can_run = today_time > (last_check + (60 * 24 * 60 * 60))
395    if not can_run:
396        logging.info(
397                'Run RW badblocks not allowed because we have run it recently,'
398                ' last run %s. RW check allowed to run only once per 60 days',
399                dhp.get_badblocks_rw_run_time())
400    return can_run
401