#!/usr/bin/env python2
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import re
import time

from autotest_lib.client.common_lib import error

# Storage types supported
STORAGE_TYPE_SSD = 'ssd'
STORAGE_TYPE_NVME = 'nvme'
STORAGE_TYPE_MMC = 'mmc'

# Storage states supported
STORAGE_STATE_NORMAL = 'normal'
STORAGE_STATE_WARNING = 'warning'
STORAGE_STATE_CRITICAL = 'critical'

BADBLOCK_CHECK_RO = 'RO'
BADBLOCK_CHECK_RW = 'RW'


class StorageError(error.TestFail):
    """Custom error class raised when storage info is unsupported or
    unavailable.
    """
    pass


class ConsoleError(error.TestFail):
    """Common error class for servod console-back control failures."""
    pass


class StorageStateValidator(object):
    """Class to detect the type and state of the DUT storage.

    The class supports SSD, NVME and MMC storage types.
    The state is detected and set as:
    - normal   - drive is in good shape
    - warning  - drive is close to the worn-out state by some metric
    - critical - drive is worn out and has errors
    """

    def __init__(self, host):
        """Initialize the storage validator.

        @param host: cros_host object providing console access
                     for reading the target info.

        @raises ConsoleError: if the info cannot be read
        @raises StorageError: if the info is not present
        """
        self._host = host
        self._storage_type = None
        self._storage_state = None
        self._info = []

        if not self._host:
            raise StorageError('Host is not provided')

        self._read_storage_info()

    def _read_storage_info(self):
        """Read the storage info from SMART.

        The info is stored as a collection of lines.

        @raises StorageError: if no info is provided or data is unavailable
        """
        logging.info('Extracting storage info')
        command = '. /usr/share/misc/storage-info-common.sh; get_storage_info'
        cmd_result = self._host.run(command, ignore_status=True)
        if cmd_result.exit_status != 0:
            raise StorageError('receive error: %s;' % cmd_result.stderr)

        if cmd_result.stdout:
            self._info = cmd_result.stdout.splitlines()
        if len(self._info) == 0:
            raise StorageError('Storage info is empty')

    def get_type(self):
        """Determine the type of the storage on the host.

        @returns storage type (SSD, NVME, MMC)

        @raises StorageError: if the type is not supported or cannot be
                determined
        """
        if not self._storage_type:
            self._storage_type = self._get_storage_type()
        return self._storage_type

    def get_state(self, run_badblocks=None):
        """Determine the state of the storage on the host.

        @param run_badblocks: string key controlling the badblocks check.
                None  - check whether we can run it
                "NOT" - do not run the check
                "RW"  - run the read-write check if booted from USB
                "RO"  - run the read-only check
        @returns storage state (normal|warning|critical)

        @raises StorageError: if the type is not supported or the state
                cannot be determined
        """
        if not self._storage_state:
            storage_type = self.get_type()
            if storage_type == STORAGE_TYPE_SSD:
                self._storage_state = self._get_state_for_ssd()
            elif storage_type == STORAGE_TYPE_MMC:
                self._storage_state = self._get_state_for_mmc()
            elif storage_type == STORAGE_TYPE_NVME:
                self._storage_state = self._get_state_for_nvme()
        if (run_badblocks != 'NOT'
                and self._storage_state != STORAGE_STATE_CRITICAL
                and self._support_health_profile()):
            # Run badblocks if the storage is not in critical state.
            # If a bad block is found then mark the storage as bad.
            logging.info('Trying to run badblocks on the device')
            dhp = self._host.health_profile
            usb_boot = self._host.is_boot_from_external_device()
            if run_badblocks is None:
                if _is_time_to_run_badblocks_ro(dhp):
                    run_badblocks = BADBLOCK_CHECK_RO
                # Blocked for now until we confirm that SMART stats do not
                # detect the issue before we do.
                # if usb_boot and _is_time_to_run_badblocks_rw(dhp):
                #     run_badblocks = BADBLOCK_CHECK_RW
            logging.debug('run_badblocks=%s', run_badblocks)
            if usb_boot and run_badblocks == BADBLOCK_CHECK_RW:
                self._run_read_write_badblocks_check()
                dhp.refresh_badblocks_rw_run_time()
                # RO is a subset of RW, so update its timestamp too.
                dhp.refresh_badblocks_ro_run_time()
            if run_badblocks == BADBLOCK_CHECK_RO:
                # SMART stats sometimes do not report an issue when blocks
                # are bad for reading, so we run the RO check.
                self._run_readonly_badblocks_check()
                dhp.refresh_badblocks_ro_run_time()
        return self._storage_state

    def _get_storage_type(self):
        """Read the info to detect the type of the storage by patterns."""
        logging.info('Extracting storage type')
        # Example "SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s)"
        sata_detect = r"SATA Version is:.*"

        # Example "  Extended CSD rev 1.7 (MMC 5.0)"
        mmc_detect = r"\s*Extended CSD rev.*MMC (?P<version>\d+.\d+)"

        # Example "SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)"
        nvme_detect = r".*NVMe Log .*"

        for line in self._info:
            if re.match(sata_detect, line):
                logging.info('Found SATA device')
                logging.debug('Found line => ' + line)
                return STORAGE_TYPE_SSD

            m = re.match(mmc_detect, line)
            if m:
                version = m.group('version')
                logging.info('Found eMMC device, version: %s', version)
                logging.debug('Found line => ' + line)
                return STORAGE_TYPE_MMC

            if re.match(nvme_detect, line):
                logging.info('Found NVMe device')
                logging.debug('Found line => ' + line)
                return STORAGE_TYPE_NVME
        raise StorageError('Storage type cannot be detected')

    def _get_state_for_ssd(self):
        """Read the info to detect the state of SSD storage."""
        logging.info('Extracting metrics for SSD storage')
        # Field meanings and an example line with a failing attribute:
        # https://en.wikipedia.org/wiki/S.M.A.R.T.
        # ID# ATTRIBUTE_NAME      FLAGS   VALUE WORST THRESH FAIL RAW_VALUE
        # 184 End-to-End_Error    PO--CK  001   001   097    NOW  135
        ssd_fail = r"""\s*(?P<param>\S+\s\S+)      # ID and attribute name
                    \s+[P-][O-][S-][R-][C-][K-]    # flags
                    (\s+\d{3}){3}                  # three 3-digit numbers
                    \s+NOW                         # fail indicator"""

        ssd_relocate_sectors = r"""\s*\d\sReallocated_Sector_Ct
                \s*[P-][O-][S-][R-][C-][K-]        # flags
                \s*(?P<value>\d{3})                # VALUE
                \s*(?P<worst>\d{3})                # WORST
                \s*(?P<thresh>\d{3})               # THRESH
                """
        # Both patterns above use verbose regex syntax (inline comments and
        # layout whitespace), so re.VERBOSE is passed when matching.
        # Future optimization: read the GPL and determine the percentage.
        for line in self._info:
            if re.match(ssd_fail, line, re.VERBOSE):
                logging.debug('Found fail line => ' + line)
                return STORAGE_STATE_CRITICAL

            m = re.match(ssd_relocate_sectors, line, re.VERBOSE)
            if m:
                logging.info('Found Reallocated_Sector_Ct line => ' + line)
                value = int(m.group('value'))
                # Manufacturers set the default value to 100; if the number
                # has started to grow, it is time to mark the drive.
                if value > 100:
                    return STORAGE_STATE_WARNING
        return STORAGE_STATE_NORMAL

    def _get_state_for_mmc(self):
        """Read the info to detect the state of MMC storage."""
        logging.debug('Extracting metrics for MMC storage')
        # Ex:
        # Device life time type A [DEVICE_LIFE_TIME_EST_TYP_A: 0x01]
        # 0x00~0x09 means 0-90% band
        # 0x0a means 90-100% band
        # 0x0b means over 100% band
        # life time percentage
        mmc_fail_lev = (r".*(?P<param>DEVICE_LIFE_TIME_EST_TYP_.)]?:"
                        r" 0x0(?P<val>\S)")

        # Ex "Pre EOL information [PRE_EOL_INFO: 0x01]"
        # 0x00 - not defined
        # 0x01 - Normal
        # 0x02 - Warning, consumed 80% of the reserved blocks
        # 0x03 - Urgent, consumed 90% of the reserved blocks
        mmc_fail_eol = r".*(?P<param>PRE_EOL_INFO.)]?: 0x0(?P<val>\d)"

        eol_value = 0
        lev_value = -1
        for line in self._info:
            m = re.match(mmc_fail_lev, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for lifetime estimate => ' + line)
                if 'a' == param:
                    val = 100
                elif 'b' == param:
                    val = 101
                else:
                    val = int(param) * 10
                if val > lev_value:
                    lev_value = val
                continue

            m = re.match(mmc_fail_eol, line)
            if m:
                param = m.group('val')
                logging.debug('Found line for end-of-life => ' + line)
                eol_value = int(param)
                break

        # Set the state based on end-of-life info.
        if eol_value == 3:
            return STORAGE_STATE_CRITICAL
        elif eol_value == 2:
            return STORAGE_STATE_WARNING
        elif eol_value == 1:
            return STORAGE_STATE_NORMAL

        # Set the state based on the life time estimates.
        elif lev_value < 90:
            return STORAGE_STATE_NORMAL
        elif lev_value < 100:
            return STORAGE_STATE_WARNING
        return STORAGE_STATE_CRITICAL

    def _get_state_for_nvme(self):
        """Read the info to detect the state of NVMe storage."""
        logging.debug('Extracting metrics for NVMe storage')
        # Ex "Percentage Used: 100%"
        nvme_fail = r"Percentage Used:\s+(?P<param>(\d{1,3}))%"
        used_value = -1
        for line in self._info:
            m = re.match(nvme_fail, line)
            if m:
                param = m.group('param')
                logging.debug('Found line for usage => ' + line)
                try:
                    used_value = int(param)
                except ValueError:
                    logging.info('Could not cast %s to int', param)
                break

        if used_value < 91:
            return STORAGE_STATE_NORMAL
        # Stopped marking the device as bad when it reaches 100% usage.
        # TODO(otabek) crbug.com/1140507 re-evaluate the max usage
        return STORAGE_STATE_WARNING

    def _get_device_storage_path(self):
        """Find and return the path to the device storage.

        The method supports detection even when the device is booted
        from USB.

        @returns path to the main device, e.g. '/dev/XXXX'
        """
        # Find the name of the device storage.
        cmd = ('. /usr/sbin/write_gpt.sh;'
               ' . /usr/share/misc/chromeos-common.sh;'
               ' load_base_vars; get_fixed_dst_drive')
        cmd_result = self._host.run(cmd,
                                    ignore_status=True,
                                    timeout=60)
        if cmd_result.exit_status != 0:
            logging.debug('Failed to detect path to the device storage')
            return None
        return cmd_result.stdout.strip()

    def _run_readonly_badblocks_check(self):
        """Run a read-only badblocks verification on the device storage.

        The block size is set to 512.
        """
        path = self._get_device_storage_path()
        if not path:
            # Cannot continue if the storage device was not detected.
            return
        logging.info("Running read-only badblocks check; path=%s", path)
        cmd = 'badblocks -e 1 -s -b 512 %s' % path
        try:
            # Set the timeout to 1 hour; the check is expected to finish
            # within 30 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=3600)
            if cmd_result.exit_status != 0:
                logging.debug('Failed to run badblocks on the device storage')
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found;
                # an empty result means the storage is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout while running badblocks')
            logging.debug(str(e))

    def _run_read_write_badblocks_check(self):
        """Run a non-destructive read-write check on the device storage.

        The block size is set to 4096.
        This check can be run only when the DUT is booted from USB.
        """
        path = self._get_device_storage_path()
        if not path:
            # Cannot continue if the storage device was not detected.
            return
        logging.info("Running read-write badblocks check; path=%s", path)
        cmd = 'badblocks -e 1 -nsv -b 4096 %s' % path
        try:
            # Set the timeout to 90 minutes; the check is expected to finish
            # within 50 minutes.
            cmd_result = self._host.run(cmd, ignore_status=True, timeout=5400)
            if cmd_result.exit_status != 0:
                logging.debug('Failed to run badblocks on the device storage')
                return
            result = cmd_result.stdout.strip()
            if result:
                logging.debug("Check result: '%s'", result)
                # A non-empty result means bad blocks were found;
                # an empty result means the storage is good.
                self._storage_state = STORAGE_STATE_CRITICAL
        except Exception as e:
            if 'Timeout encountered:' in str(e):
                logging.info('Timeout while running badblocks')
            logging.info('(Not critical) %s', e)

    def _support_health_profile(self):
        """Check whether the host provides a device health profile."""
        return (hasattr(self._host, 'health_profile')
                and self._host.health_profile)


def _is_time_to_run_badblocks_ro(dhp):
    """Verify that the device can proceed to run the RO badblocks check.

    The RO check can be executed no more often than once per 6 days.

    @returns True if the check can proceed, False otherwise
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_ro_run_time_epoch()
    can_run = today_time > (last_check + (6 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'RO badblocks is not allowed to run because it ran recently;'
                ' last run: %s. The RO check is allowed to run only once per'
                ' 6 days.', dhp.get_badblocks_ro_run_time())
    return can_run


def _is_time_to_run_badblocks_rw(dhp):
    """Verify that the device can proceed to run the RW badblocks check.

    The RW check can be executed no more often than once per 60 days.

    @returns True if the check can proceed, False otherwise
    """
    today_time = int(time.time())
    last_check = dhp.get_badblocks_rw_run_time_epoch()
    can_run = today_time > (last_check + (60 * 24 * 60 * 60))
    if not can_run:
        logging.info(
                'RW badblocks is not allowed to run because it ran recently;'
                ' last run: %s. The RW check is allowed to run only once per'
                ' 60 days.', dhp.get_badblocks_rw_run_time())
    return can_run
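
# Illustrative usage sketch (not part of this module's API): how a test or
# repair action might consume the validator. The `host` object below is
# assumed to be a cros_host instance supplied by the autotest framework;
# obtaining it depends on the calling test's context.
#
#   validator = StorageStateValidator(host)
#   storage_type = validator.get_type()                 # 'ssd'|'nvme'|'mmc'
#   storage_state = validator.get_state(run_badblocks='NOT')
#   if storage_state == STORAGE_STATE_CRITICAL:
#       raise error.TestFail('DUT %s storage is worn out' % storage_type)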