#! /bin/bash
#
# Stress test driver for Linux MCA High Level Handlers
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; version
# 2.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should find a copy of v2 of the GNU General Public License somewhere
# on your Linux system; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Copyright (C) 2009, Intel Corp.
#   Author: Haicheng Li <haicheng.li@intel.com>
#
# Overview:
#   1. parse options, pick an error injector (select_injector)
#   2. prepare log files, environment and the filesystem under test
#   3. start background workloads (fs_metadata, optionally LTP)
#   4. inject errors until the test duration elapses (err_inject)
#   5. fsck the device and summarize results (run_fsck, cleanup on EXIT)
#

#set -x
sd=$(dirname "$0")
ROOT=$(cd "$sd/.." && pwd)
export ROOT

. "$ROOT/lib/mce.sh"

DEBUG=0

# ANSI color sequences, interpreted by "echo -e".
g_clr_red='\033[0;31m'
g_clr_green='\033[0;32m'
g_clr_yellow='\033[0;33m'
g_clr_blue='\033[0;34m'
g_clr_normal='\033[0;39m'

# Run a command in the foreground; its output is discarded unless -V is set.
# Arguments: the command and its arguments (boundaries are preserved).
# Returns:   exit status of the command.
silent_exec()
{
    if [ "$DEBUG" -eq 0 ]; then
        "$@" > /dev/null 2>&1
    else
        "$@"
    fi
}

# Same as silent_exec, but the command is started in the background.
# The caller reads the child's pid from $! right after this returns.
silent_exec_background()
{
    if [ "$DEBUG" -eq 0 ]; then
        "$@" > /dev/null 2>&1 &
    else
        "$@" &
    fi
}

# Write to the console. Leading echo flags (-n, -e, -en) are passed through.
# Arguments are quoted so that tags such as "[info]" are never glob-expanded.
_print()
{
    echo "$@" > "$g_tty"
}

# Debug message: console (yellow) + log file, only in verbose mode (-V).
dbp()
{
    [ "$DEBUG" -ne 1 ] && return
    _print -en "$g_clr_yellow"
    _print "[debug] $*"
    echo "[debug] $*" >> "$g_logfile"
    _print -en "$g_clr_normal"
}

# Informational message: console (yellow) + log file.
log()
{
    _print -en "$g_clr_yellow"
    _print "[info] $*"
    echo "[info] $*" >> "$g_logfile"
    _print -en "$g_clr_normal"
}

# Mark the start of a step on the console and in the log file.
begin()
{
    _print -n "$*"
    _print -en "$g_clr_green"
    _print -e "\t [start]"
    echo -e "$* \t [start]" >> "$g_logfile"
    _print -en "$g_clr_normal"
}

# Mark the end of a step on the console and in the log file.
end()
{
    _print -n "$*"
    _print -en "$g_clr_green"
    _print -e "\t [done]"
    echo -e "$* \t [done]" >> "$g_logfile"
    _print -en "$g_clr_normal"
}

# Common implementation of err() and invalid().
# Arguments: $1 - optional hint line printed on the console ("" for none)
#            $2.. - error message
# Records the abort in g_aborted (read by cleanup) and exits with status 1.
_abort()
{
    local hint=$1
    shift

    g_aborted=1
    _print -en "$g_clr_red"
    echo > "$g_tty"
    echo "Test aborted by unexpected error!" > "$g_tty"
    _print "[error] !!! $* !!!"
    echo > "$g_tty"
    if [ -n "$hint" ]; then
        echo "$hint" > "$g_tty"
        echo > "$g_tty"
    fi
    echo "Test aborted by unexpected error!" >> "$g_result"
    echo "[error] !!! $* !!!" >> "$g_result"
    echo "[error] !!! $* !!!" >> "$g_logfile"
    _print -en "$g_clr_normal"
    exit 1
}

# Report a fatal runtime error and exit 1.
err()
{
    _abort "" "$@"
}

die()
{
    err "$@"
}

# Report a fatal usage/environment error (with a pointer to -h) and exit 1.
invalid()
{
    _abort "Try \`./hwpoison.sh -h\` for more information." "$@"
}

# Emit one line of the result summary: console (blue), result file, log file.
result()
{
    _print -en "$g_clr_blue"
    _print -e "$*"
    echo -e "$*" >> "$g_result"
    echo -e "$*" >> "$g_logfile"
    _print -en "$g_clr_normal"
}

# Succeeds if $1 (a device or network share) shows up as a whole word in the
# output of df. An empty argument is never considered mounted.
is_mounted()
{
    [ -n "$1" ] || return 1
    df | grep -qwF -- "$1"
}

# Print the end address (hex, without 0x) of the "System RAM" range in
# /proc/iomem whose start address equals $1 (hex, no 0x, no leading zeros).
# Prints nothing when there is no such range.
iomem_ram_end()
{
    awk -v start="$1" '
        /System RAM/ {
            split($1, range, "-")
            sub(/^0+/, "", range[1])
            if (range[1] == start) {
                print range[2]
                exit
            }
        }' /proc/iomem
}

# Compute the pfn ranges used by "-p all" injection:
#   g_lowmem_s/g_lowmem_e   - RAM range starting at 1M
#   g_highmem_s/g_highmem_e - RAM range starting at 4G (g_highmem_e stays
#                             empty when there is no such range)
#   g_maxpfn                - number of pages according to MemTotal
setup_meminfo()
{
    local maxmem=0
    local tmp=

    g_lowmem_s=$(( 0x100000 / g_pgsize ))    # start pfn of mem < 4G
    tmp=$(iomem_ram_end 100000)
    if [ -n "$tmp" ]; then
        g_lowmem_e=$(( 0x$tmp / g_pgsize ))
    else
        g_lowmem_e=0
    fi
    log "low mem: 0x100000 (pfn: $g_lowmem_s) ~ 0x$tmp (pfn: $g_lowmem_e)"

    g_highmem_s=$(( 0x100000000 / g_pgsize ))    # start pfn of highmem > 4G
    tmp=$(iomem_ram_end 100000000)
    if [ -n "$tmp" ]; then
        g_highmem_e=$(( 0x$tmp / g_pgsize ))
        log "high mem: 0x100000000 (pfn: $g_highmem_s) ~ 0x$tmp (pfn: $g_highmem_e)"
    fi

    # MemTotal is reported in kB; convert to pages of g_pgsize bytes.
    maxmem=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
    g_maxpfn=$(( maxmem * 1024 / g_pgsize ))
    log "max pfn number: g_maxpfn = $g_maxpfn"
}

# Configure the hwpoison debugfs filter for the selected injector.
#  - soft offline / madvise: the filter is disabled
#  - pfn injection: errors are restricted to pages of $g_dev (major/minor)
# In madvise mode the vm dirty sysctls are raised; cleanup restores them.
setup_errinj()
{
    local dev_major=
    local dev_minor=
    local rc=0
    local filter=$g_debugfs/hwpoison

    if [ "$g_soft_offline" -eq 1 ]; then
        [ -f "$filter/corrupt-filter-enable" ] && echo 0 > "$filter/corrupt-filter-enable"
        return
    fi
    if [ "$g_madvise" -eq 1 ]; then
        [ -f "$filter/corrupt-filter-enable" ] && echo 0 > "$filter/corrupt-filter-enable"
        # to avoid unexpected page-state changing in background while testing.
        echo 70 > /proc/sys/vm/dirty_background_ratio
        echo 70 > /proc/sys/vm/dirty_ratio
        echo 1000000 > /proc/sys/vm/dirty_expire_centisecs
        return
    fi
    dev_major=$(stat --format=%t "$g_dev" 2> /dev/null) || rc=1
    dev_minor=$(stat --format=%T "$g_dev" 2> /dev/null) || rc=1
    [ "$rc" -eq 1 ] && invalid "invalid device: no inode # can be found"
    echo "0x$dev_major" > "$filter/corrupt-filter-dev-major"
    echo "0x$dev_minor" > "$filter/corrupt-filter-dev-minor"
    if [ "$g_pgtype" = "all" ] && [ -f "$filter/corrupt-filter-flags-mask" ]; then
        echo 0 > "$filter/corrupt-filter-flags-mask"
    fi
    [ -f "$filter/corrupt-filter-enable" ] && echo 1 > "$filter/corrupt-filter-enable"
    return
}

# Create (unless -N or a network fs) and mount the filesystem under test
# on $g_testdir. Asks for confirmation before formatting unless -F is set.
setup_fs()
{
    local mkfs="mkfs.$g_fstype"
    local -a mkfs_opts=(-q)
    local -a mount_opts=()
    local in=

    [ "$g_fstype" = reiserfs ] && mkfs="mkreiserfs"
    [ "$g_fstype" = ocfs2 ] && mkfs_opts+=(-M local)
    [ "$g_fstype" = cifs ] && mount_opts=(-o password=)
    mkdir -p "$g_testdir" || err "cannot mkdir $g_testdir"
    if [ "$g_nomkfs" -eq 0 ] && [ "$g_netfs" -eq 0 ]; then
        silent_exec command -v "$mkfs" || err "mkfs: unsupported fstype: $g_fstype"
        if [ "$g_force" -eq 0 ] && [ "$g_fstype" != "ocfs2" ]; then
            echo -n "test will format $g_dev to $g_fstype, continue [y/n]? "
            read -r in
            case "$in" in
            y|yes|Y) ;;
            *) err "$mkfs on $g_dev is cancelled" ;;
            esac
        fi
        begin "-- $mkfs $g_dev"
        case "$g_fstype" in
        vfat|msdos|btrfs) mkfs_opts=() ;;
        xfs) mkfs_opts=(-f) ;;
        esac
        # The ocfs2 mkfs asks by itself; its output is silenced, so show the prompt here.
        [ "$g_fstype" = ocfs2 ] && echo -n "test will format $g_dev to $g_fstype, continue [y/n]? "
        silent_exec "$mkfs" "${mkfs_opts[@]}" "$g_dev" || err "cannot $mkfs ${mkfs_opts[*]} on $g_dev"
        end "-- $mkfs $g_dev"
    fi
    if [ "$g_netfs" -eq 0 ]; then
        silent_exec mount -t "$g_fstype" "$g_dev" "$g_testdir" \
            || err "cannot mount $g_fstype fs: $g_dev to $g_testdir"
    else
        silent_exec mount -t "$g_fstype" "${mount_opts[@]}" "$g_netdev" "$g_testdir" \
            || err "cannot mount $g_fstype ${mount_opts[*]} fs: $g_netdev to $g_testdir"
    fi
}

# Validate options and the machine setup; aborts via invalid() on any problem.
# Also resolves g_debugfs, g_netfs and the absolute path of g_pagetool.
check_env()
{
    check_debugfs
    # "mount" prints: <dev> on <mountpoint> type <fstype> (<options>)
    g_debugfs=$(mount | awk '$5 == "debugfs" {print $3; exit}')
    [ -z "$g_tty" ] && invalid "$g_tty does not exist"
    if [ "$g_test" -eq 0 ]; then
        if [ "$g_fstype" = "nfs" ] || [ "$g_fstype" = "cifs" ]; then
            g_netfs=1
            [ -z "$g_netdev" ] && invalid "net device is not specified"
        fi
        [ -z "$g_dev" ] && invalid "device is not specified"
        [ -b "$g_dev" ] || invalid "invalid device: $g_dev"
        if [ "$g_netfs" -eq 0 ]; then
            is_mounted "$g_dev" && invalid "device $g_dev has been mounted by others"
        else
            is_mounted "$g_netdev" && invalid "device $g_netdev has been mounted by others"
        fi
    fi
    [ -d "$g_bindir" ] || invalid "no bin subdir there"
    if [ "$g_madvise" -eq 0 ] || [ "$g_recycle" -ne 0 ]; then
        silent_exec command -v "$g_pagetool" || invalid "no $g_pagetool tool on the system"
        g_pagetool=$(command -v "$g_pagetool")
        dbp "Found the tool: $g_pagetool"
    fi
    if [ "$g_pfninj" -eq 1 ]; then
        if [ "$g_soft_offline" -eq 1 ]; then
            [ -f "$g_sysfs_mem/soft_offline_page" ] || invalid "pls. ensure soft_offline_page is enabled"
        else
            #if hwpoison_inject is a module, it is ensured to have been loaded
            if modinfo hwpoison_inject > /dev/null 2>&1; then
                [ -d "$g_debugfs/hwpoison/" ] || modprobe hwpoison_inject \
                    || invalid "module hwpoison_inject isn't supported ?"
            fi
        fi
    fi
    if [ "$g_recycle" -ne 0 ]; then
        [ -f "$g_debugfs/hwpoison/unpoison-pfn" ] \
            || invalid "pls. insmod hwpoison_inject module with unpoison-pfn support"
    fi
    if [ "$g_apei" -eq 1 ]; then
        #if einj is a module, it is ensured to have been loaded
        if modinfo einj > /dev/null 2>&1; then
            [ -d "$g_debugfs/apei/einj" ] || modprobe einj \
                || invalid "module apei_inj isn't supported ?"
        fi
    fi
    if [ ! -d "$g_ltproot" ] || [ ! -f "$g_ltppan" ]; then
        invalid "no ltp-pan on the machine: $g_ltppan"
    fi
    if [ "$g_runltp" -eq 1 ]; then
        if [ ! -d "$g_ltproot" ] || [ ! -f "$g_ltproot/runltp" ]; then
            invalid "no runltp on the machine"
        fi
    fi
    [ "$g_duration" -eq 0 ] && invalid "test duration is set as 0 second"
}

# (Re)create the result/log directories and start fresh log and result files.
setup_log()
{
    mkdir -p "$g_resultdir"
    # ${var:?} guards against wiping "/" should g_logdir ever be empty.
    rm -rf -- "${g_logdir:?}"
    mkdir -p "$g_logdir"
    echo "# hwpoison.sh $g_parameter" > "$g_logfile"
    echo "# hwpoison.sh $g_parameter" > "$g_result"
    [ "$g_test" -eq 0 ] && clear > "$g_tty"
    echo "# hwpoison.sh $g_parameter" > "$g_tty"
}

# Prepare everything needed before workloads start. From here on, cleanup
# runs on every exit path (EXIT trap).
setup_env()
{
    begin "setup test environment"
    mkdir -p "$g_casedir"
    check_env
    setup_errinj
    setup_meminfo
    trap cleanup EXIT
    [ "$g_test" -eq 0 ] && setup_fs
    export PATH="${PATH}:$g_bindir"
    end "setup test environment"
}

# Start the LTP suite in the background for g_duration seconds.
# Sets g_pid_ltp.
run_ltp()
{
    local ltp_failed=$g_logdir/ltp/ltp_failed
    local ltp_log=$g_logdir/ltp/ltp_log
    local ltp_output=$g_logdir/ltp/ltp_output
    local ltp_tmp=$g_testdir/ltp_tmp

    begin "launch ltp workload in background"
    mkdir -p "$g_logdir/ltp"
    : > "$ltp_failed"
    : > "$ltp_log"
    : > "$ltp_output"
    mkdir -p "$ltp_tmp"
    silent_exec_background "$g_ltproot/runltp" -d "$ltp_tmp" -l "$ltp_log" \
        -o "$ltp_output" -r "$g_ltproot" -t "${g_duration}s" -C "$ltp_failed"
    g_pid_ltp=$!
    end "launch ltp workload in background (pid: $g_pid_ltp)"
}

# Summarize the LTP run; bumps g_failed when cases failed or no result exists.
ltp_result()
{
    local num=0
    local ltp_failed=$g_logdir/ltp/ltp_failed
    local ltp_output=$g_logdir/ltp/ltp_output

    if [ ! -f "$ltp_failed" ]; then
        result "\tltp -- error: no ltp result there"
        result "\t log: $ltp_output"
        g_failed=$(( g_failed + 1 ))
        return
    fi
    num=$(wc -l < "$ltp_failed")
    if [ "$num" -ne 0 ]; then
        result "\tltp -- $num case(s) failed"
        result "\t log: $ltp_output"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tltp -- all tests pass"
    fi
}


# Start the fs_metadata workload under ltp-pan in the background.
# Sets g_pid_fsmeta. Aborts if the workload reports "abort" in its log
# after g_interval seconds.
fs_metadata()
{
    local dir=$g_logdir/fs_metadata
    local result=$dir/fs_metadata.result
    local log=$dir/fs_metadata.log
    local pan_log=$dir/pan_log
    local pan_output=$dir/pan_output
    local pan_zoo=$dir/pan_zoo
    local pan_failed=$dir/pan_failed
    local tmp=$g_testdir/fs_metadata
    local threads=
    local node_number=5
    local tree_depth=6

    if [ "$g_children" -eq 0 ]; then
        threads=$(( g_duration / 720 ))
    else
        threads=$g_children
    fi
    [ "$threads" -gt 10 ] && threads=10 && node_number=6
    [ "$threads" -eq 0 ] && threads=1

    begin "launch fs_metadata workload"
    mkdir -p "$dir"
    : > "$pan_failed"
    : > "$pan_log"
    : > "$pan_output"
    : > "$pan_zoo"
    log "setup fs_metadata test environment"
    # Must finish before mkdir: a background rm could delete the new directory.
    silent_exec rm -rf -- "$tmp"
    mkdir -p "$tmp" || err "cannot create dir: $tmp"

    echo "fs_metadata fs-metadata.sh $tree_depth $node_number $threads $g_duration $result $tmp $log" > "$g_casedir/fs_metadata"
    dbp "g_ltppan -n fs_metadata -a $pan_zoo -f $g_casedir/fs_metadata -o $pan_output -l $pan_log -C $pan_failed &"
    silent_exec_background "$g_ltppan" -n fs_metadata -a "$pan_zoo" \
        -f "$g_casedir/fs_metadata" -o "$pan_output" -l "$pan_log" -C "$pan_failed"
    g_pid_fsmeta=$!
    sleep "$g_interval"
    silent_exec grep "abort" "$log" && err "failed to launch fs_metadata workload, it might be due to insufficient disk space, pls read $log for details!"
    end "launch fs_metadata workload (pid: $g_pid_fsmeta)"
}

# Summarize the fs_metadata workload from the FAIL/PASS counters of its
# result file; bumps g_failed on failures or when nothing finished.
fs_metadata_result()
{
    local fail_num=0
    local pass_num=0
    local dir=$g_logdir/fs_metadata
    local result=$dir/fs_metadata.result
    local log=$dir/fs_metadata.log

    if [ ! -f "$result" ]; then
        result "\tfs_metadata -- error: no result there"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
        return
    fi
    fail_num=$(grep FAIL "$result" | awk -F : '{print $NF}')
    pass_num=$(grep PASS "$result" | awk -F : '{print $NF}')
    # Without a FAIL counter the result file is treated as "nothing finished".
    [ -z "$fail_num" ] && fail_num=0 && pass_num=0
    [ -z "$pass_num" ] && pass_num=0
    if [ "$fail_num" -ne 0 ]; then
        result "\tfs_metadata -- $fail_num tests failed, $pass_num tests pass."
        result "\t details: $result"
        g_failed=$(( g_failed + 1 ))
    elif [ "$pass_num" -eq 0 ]; then
        result "\tfs_metadata -- no test finished"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tfs_metadata -- all $pass_num tests got pass"
    fi

    return
}

# fs_specific workload, TBD
fs_specific()
{
    begin "launch $g_fstype specific workload"

    touch "$g_logdir/fs_specific"
#    $g_ltppan -n fs_specific -a $g_logdir/fs_specific -f $g_casedir/fs_specific -t ${g_duration}s &
    end "launch $g_fstype specific workload"
}

# Start the madvise based page-poisoning test under ltp-pan in the
# background for g_duration seconds. Sets g_pid_madv.
page_poisoning()
{
    local dir=$g_logdir/page_poisoning
    local pan_failed=$dir/pan_failed
    local pan_log=$dir/pan_log
    local pan_output=$dir/pan_output
    local tmp=$g_testdir/page_poisoning
    local pan_zoo=$dir/pan_zoo
    local result=$dir/page_poisoning.result
    local log=$dir/page_poisoning.log
    local opts=

    begin "-- launch page_poisoning test"
    mkdir -p "$dir"
    : > "$pan_failed"
    : > "$pan_log"
    : > "$pan_output"
    : > "$pan_zoo"
    : > "$log"
    : > "$result"
    mkdir -p "$tmp" || err "cannot create dir: $tmp"

    [ "$g_children" -ne 0 ] && opts="-i $g_children"

    echo "page_poisoning page-poisoning -l $log -r $result -t $tmp $opts" > "$g_casedir/page_poisoning"
    dbp "$g_ltppan -n page_poisoning -a $pan_zoo -f $g_casedir/page_poisoning -t ${g_duration}s -o $pan_output -l $pan_log -C $pan_failed &"
    silent_exec_background "$g_ltppan" -n page_poisoning -a "$pan_zoo" \
        -f "$g_casedir/page_poisoning" -t "${g_duration}s" \
        -o "$pan_output" -l "$pan_log" -C "$pan_failed"
    g_pid_madv=$!
    end "-- launch page_poisoning test (pid: $g_pid_madv)"
}

# Summarize the page_poisoning test by counting FAILED/PASS lines.
page_poisoning_result()
{
    local fail_num=0
    local pass_num=0
    local dir=$g_logdir/page_poisoning
    local result=$dir/page_poisoning.result
    local log=$dir/page_poisoning.log

    if [ ! -f "$result" ]; then
        result "\tpage_poisoning -- error: no result file there"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
        return
    fi
    fail_num=$(grep -c FAILED "$result")
    pass_num=$(grep -c PASS "$result")
    if [ "$fail_num" -ne 0 ]; then
        result "\tpage_poisoning -- $fail_num tests failed, $pass_num tests pass."
        result "\t details: $result"
        g_failed=$(( g_failed + 1 ))
    elif [ "$pass_num" -eq 0 ]; then
        result "\tpage_poisoning -- no case finished"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tpage_poisoning -- all $pass_num tests got pass"
    fi

    return
}

# Start all filesystem workloads.
run_workloads()
{
    fs_metadata
    #fs_specific
    return
}

# Unpoison one page. $1 - pfn (0x-prefixed hex).
_pfn_unpoison()
{
    local pg=$1

    echo "$pg" > "$g_debugfs/hwpoison/unpoison-pfn"
    dbp "echo $pg > $g_debugfs/hwpoison/unpoison-pfn"
}

# Unpoison every page the page tool currently lists as hwpoison.
pfn_unpoison()
{
    local pg_list=
    local pg=0

    pg_list=$("$g_pagetool" -NLrb hwpoison | grep -v offset | cut -f1)
    # word splitting of the pfn list is intended
    for pg in $pg_list; do
        _pfn_unpoison "0x$pg" > /dev/null 2>&1
    done
}

# Log injection progress in steps of at least 10% and, with -R, unpoison
# all poisoned pages every g_recycle seconds.
# g_last holds the time of the last recycle (0 = not started yet).
show_progress()
{
    local cur=
    local rest=0
    local percent=0
    local next=0
    local msg="hwpoison page error injection"

    [ "$g_soft_offline" -eq 1 ] && msg="page soft offline"

    cur=$(date +%s)
    [ "$cur" -ge "$g_time_e" ] && return
    if [ "$g_recycle" -ne 0 ]; then
        [ "$g_last" -eq 0 ] && g_last=$g_time_s
        if [ $(( cur - g_last )) -ge "$g_recycle" ]; then
            g_last=$cur
            pfn_unpoison
        fi
    fi
    rest=$(( g_time_e - cur ))
    percent=$(( (g_duration - rest) * 100 / g_duration ))
    [ "$percent" -eq 0 ] && return
    [ "$percent" -gt 10 ] && next=$(( percent - 10 ))
    # Stay quiet until progress advanced by 10% since the last message.
    [ "$g_percent" -ne 0 ] && [ "$g_percent" -gt "$next" ] && return
    g_percent=$percent
    log "$msg: $g_percent% pages done"
}

# Inject a hwpoison error into one page. $1 - pfn (0x-prefixed hex).
_pfn_hwpoison()
{
    local pfn=$1

    echo "$pfn" > "$g_debugfs/hwpoison/corrupt-pfn"
    dbp "echo $pfn > $g_debugfs/hwpoison/corrupt-pfn"
}

# Soft-offline one page. $1 - pfn (0x-prefixed hex); the sysfs file takes
# the physical address, i.e. pfn * page size.
_pfn_soft_offline()
{
    local pfn=$1
    local paddr

    paddr=$(printf "0x%x" $(( pfn * g_pgsize )))
    echo "$paddr" > "$g_sysfs_mem/soft_offline_page"
    dbp "echo $paddr > $g_sysfs_mem/soft_offline_page"
}

# One injection pass using hwpoison or soft offline.
#  - pgtype "all": walk the low memory range, then the range above 4G
#  - otherwise: inject into every page the page tool lists for g_pgtype
# Stops early when the test end time is reached (checked every g_progress pages).
pfn_inj()
{
    local pg_list=
    local pg=0
    local pfn=0
    local cur=
    local i=0
    local inj=_pfn_hwpoison

    [ "$g_soft_offline" -eq 1 ] && inj=_pfn_soft_offline
    if [ "$g_pgtype" = "all" ]; then
        pfn=$g_lowmem_s    # start from 1M.
        while [ "$pfn" -lt "$g_maxpfn" ]; do
            "$inj" "$(printf '0x%x' "$pfn")" > /dev/null 2>&1
            pfn=$(( pfn + 1 ))
            # Jump over the hole between the two ranges exactly once; pfns
            # that are already above 4G must keep advancing.
            if [ "$pfn" -gt "$g_lowmem_e" ] && [ "$pfn" -lt "$g_highmem_s" ]; then
                [ -z "$g_highmem_e" ] && break    # no RAM above 4G
                pfn=$g_highmem_s
            fi
            [ -n "$g_highmem_e" ] && [ "$pfn" -gt "$g_highmem_e" ] && break
            i=$(( i + 1 ))
            if [ "$i" -eq "$g_progress" ]; then
                cur=$(date +%s)
                [ "$cur" -ge "$g_time_e" ] && break
                show_progress
                i=0
            fi
        done
    else
        silent_exec "$g_pagetool" -Nrb "$g_pgtype" \
            || err "unsupported pagetype, pls. refer to command: $g_pagetool -h"
        pg_list=$("$g_pagetool" -NLrb "$g_pgtype" | grep -v offset | cut -f1)
        # word splitting of the pfn list is intended
        for pg in $pg_list; do
            "$inj" "0x$pg" > /dev/null 2>&1
            i=$(( i + 1 ))
            if [ "$i" -eq "$g_progress" ]; then
                cur=$(date +%s)
                [ "$cur" -ge "$g_time_e" ] && break
                show_progress
                i=0
            fi
        done
    fi
}

# Inject an error through APEI EINJ.
# Arguments: $1 - pfn (0x-prefixed hex), $2 - EINJ error type
_apei_inj()
{
    local pfn=$1
    local type=$2
    local paddr

    paddr=$(printf "0x%x" $(( pfn * g_pgsize )))
    echo "$type" > "$g_debugfs/apei/einj/error_type"
    # NOTE(review): "err_inj/error_address" does not match the "einj/"
    # directory used by the other two writes - confirm the address file
    # name against the EINJ interface of the kernel under test.
    echo "$paddr" > "$g_debugfs/apei/err_inj/error_address"
    echo "1" > "$g_debugfs/apei/einj/error_inject"
}

# $1 - pfn; injects EINJ error type 0x2.
apei_ewb_ucr()
{
    _apei_inj "$1" 0x2
}

# $1 - pfn; injects EINJ error type 0x10.
apei_mem_ucr()
{
    _apei_inj "$1" 0x10
}

# One APEI injection pass over every page the page tool lists for g_pgtype.
apei_inj()
{
    local pg_list=
    local pg=
    local cur=
    local i=0

    pg_list=$("$g_pagetool" -NLrb "$g_pgtype" | grep -v offset | cut -f1)
    # word splitting of the pfn list is intended
    for pg in $pg_list; do
        # the page tool prints bare hex offsets; mark them as hex
        apei_mem_ucr "0x$pg"
        i=$(( i + 1 ))
        if [ "$i" -eq "$g_progress" ]; then
            cur=$(date +%s)
            [ "$cur" -ge "$g_time_e" ] && break
            show_progress
            i=0
        fi
    done

    return
}

# Main injection phase: keeps injecting (or, in madvise mode, waits for the
# page_poisoning test) until g_duration seconds have elapsed.
# Sets g_progress, g_time_s and g_time_e.
err_inject()
{
    local cur=
    local msg="hwpoison page error injection"
    local MSG="inject HWPOISON error to pages"

    if [ "$g_soft_offline" -eq 1 ]; then
        msg="page soft offline"
        MSG="soft OFFLINE pages"
    fi
    if [ "$g_madvise" -eq 1 ]; then
        begin "$MSG thru madvise syscall"
    else
        begin "$MSG ($g_pgtype)"
    fi
    g_progress=$(( g_duration * 10 ))
    g_time_s=$(date +%s)
    g_time_e=$(( g_time_s + g_duration ))
    cur=$g_time_s
    [ "$g_madvise" -eq 1 ] && page_poisoning
    log "$msg: 0% pages done"
    while [ "$cur" -lt "$g_time_e" ]; do
        show_progress
        if [ "$g_madvise" -eq 0 ]; then
            [ "$g_apei" -eq 1 ] && apei_inj
            [ "$g_pfninj" -eq 1 ] && pfn_inj
        else
            # The injection happens in the page_poisoning child; just poll.
            sleep 1
        fi
        cur=$(date +%s)
    done
    log "$msg: 100% pages done"
    # wait workloads to be finished.
    sleep "$g_interval"

    if [ "$g_madvise" -eq 1 ]; then
        end "$MSG thru madvise syscall"
    else
        end "$MSG ($g_pgtype)"
    fi
}

# Record an fsck failure; replaces the content of the fsck result and log.
fsck_err()
{
    local dir=$g_logdir/fsck
    local result=$dir/fsck.result
    local log=$dir/fsck.log

    echo "FAILED: $*" > "$result"
    echo "FAILED: $*" > "$log"
}

# Record an fsck success; replaces the content of the fsck result and log.
fsck_pass()
{
    local dir=$g_logdir/fsck
    local result=$dir/fsck.result
    local log=$dir/fsck.log

    echo "PASS: $*" > "$result"
    echo "PASS: $*" > "$log"
}

# Unmount the device, run fsck on it and mount it back. The outcome is
# recorded with fsck_err/fsck_pass and reported later by fsck_result.
run_fsck()
{
    local dir=$g_logdir/fsck
    local result=$dir/fsck.result
    local log=$dir/fsck.log
    local fsck=fsck.$g_fstype
    local -a opts=()
    local failed=0

    mkdir -p "$dir"
    : > "$log"
    : > "$result"

    [ "$g_fstype" = "btrfs" ] && fsck="btrfsck"
    if [ "$g_fstype" = "reiserfs" ]; then
        fsck="reiserfsck"
        opts=(-y)
    fi
    begin "launch $fsck on $g_dev to check test result"
    if ! silent_exec command -v "$fsck"; then
        fsck_err "fsck: unsupported fstype: $g_fstype"
        return
    fi
    fs_sync
    silent_exec umount -f "$g_dev" || sleep "$g_interval"
    if is_mounted "$g_dev"; then
        if ! silent_exec umount "$g_dev"; then
            fsck_err "cannot umount $g_dev to do $fsck"
            return
        fi
    fi
    "$fsck" "${opts[@]}" "$g_dev" || {
        fsck_err "err #$? while $fsck on $g_dev"
        failed=1
    }
    if ! silent_exec mount -t "$g_fstype" "$g_dev" "$g_testdir"; then
        fsck_err "cannot mount $g_testdir back after fsck_check"
        return
    fi
    # fsck_pass overwrites the result file, so never call it after a failure.
    [ "$failed" -eq 0 ] && fsck_pass "$fsck got pass on $g_dev"
    end "launch $fsck on $g_dev to check test result"
}

# Summarize the fsck run; bumps g_failed unless a PASS was recorded.
fsck_result()
{
    local dir=$g_logdir/fsck
    local result=$dir/fsck.result
    local log=$dir/fsck.log
    local fail_num=0
    local pass_num=0

    if [ ! -f "$result" ]; then
        result "\tfsck.$g_fstype -- no result found"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
        return
    fi

    fail_num=$(grep -c FAILED "$result")
    pass_num=$(grep -c PASS "$result")
    if [ "$fail_num" -ne 0 ]; then
        result "\tfsck.$g_fstype -- failed"
        result "\t log: $log"
        g_failed=$(( g_failed + 1 ))
    elif [ "$pass_num" -eq 0 ]; then
        result "\tfsck.$g_fstype -- not executed"
        result "\t log: $log"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tfsck.$g_fstype -- fsck on $g_dev got pass"
    fi
}

# Print the result summary of every task group that was part of this run.
result_check()
{
    begin "-- collecting test result"
    result "#############################################"
    result "result summary:"
    if [ "$g_madvise" -eq 1 ]; then
        page_poisoning_result
    else
        fs_metadata_result
        [ "$g_runltp" -eq 1 ] && ltp_result
    fi
    [ "$g_netfs" -eq 0 ] && [ "$g_test" -eq 0 ] && fsck_result
    result ""
    result "totally $g_failed task-groups report failures"
    result "#############################################"
    end "-- collecting test result"
}

# Print the help page and exit 0.
usage()
{
    echo "Usage: ./hwpoison.sh -d /dev/device [-options] [arguments]"
    echo
    echo "Stress Testing for Linux MCA High Level Handlers: "
    echo -e "\t-c console\t: target tty console to print test log"
    echo -e "\t-d device\t: target block device to run test on"
    echo -e "\t-f fstype\t: filesystem type to be tested"
    echo -e "\t-i interval\t: sleep interval (default is $g_interval seconds)"
    echo -e "\t-l logfile\t: log file"
    echo -e "\t-n netdev\t: target network disk to run test on"
    echo -e "\t-o ltproot\t: ltp root directory (default is $g_ltproot/)"
    echo -e "\t-p pagetype\t: page type to inject error "
    echo -e "\t-r result\t: result file"
    echo -e "\t-s pagesize\t: page size on the system (default is $g_pgsize bytes)"
    echo -e "\t-t duration\t: test duration time (default is $g_duration seconds)"
    echo -e "\t-A \t\t: use APEI to inject error"
    echo -e "\t-C children\t: process num of workloads"
    echo -e "\t-F \t\t: execute as force mode, no interaction with user"
    echo -e "\t-L \t\t: run ltp in background"
    echo -e "\t-M \t\t: run page_poisoning test thru madvise syscall"
    echo -e "\t-N \t\t: do not mkfs target block device"
    echo -e "\t-R recyle\t: automatically unpoison pages after running recyle seconds"
    echo -e "\t-S \t\t: test soft page offline"
    echo -e "\t-T \t\t: test mode, run test in local dir other than on target device"
    echo -e "\t-V \t\t: verbose mode, show debug info"
    echo -e "\t-h \t\t: print this page"
    echo
    echo -e "device:"
    echo -e "\tthis is a mandatory argument. typically, it's a disk partition."
    echo -e "\tall temporary files will be created on this device."
    echo -e "\terror injector will just inject errors to the pages associated"
    echo -e "\twith this device (except for the testing thru madvise syscall)."
    echo
    echo -e "pagetype:"
    echo -e "\tdefault page type:"
    echo -e "\t $g_pgtype"
    echo -e "\tfor more details, pls. try \`page-types -h\`."
    echo -e "\tsee the definition of \"bits-spec\"."
    echo
    echo -e "console:"
    echo -e "\ttest can print output to the console you specified."
    echo -e "\te.g. '-c /dev/tty1'"
    echo

    exit 0
}

# Flush dirty data of the disk under test.
fs_sync()
{
    log "now to sync up the disk under testing, might need several minutes ..."
    sync
}

# Send SIGTERM to background task $1 if it is still around, then give it
# g_interval seconds to go away. An empty pid is ignored.
_stop_child()
{
    local pid=$1

    [ -n "$pid" ] || return 0
    silent_exec ps "$pid" || return 0
    kill -15 "$pid" > /dev/null 2>&1
    sleep "$g_interval"
}

# Terminate every background workload started by this script.
stop_children()
{
    local pid

    begin "-- cleaning up remaining tasks in background"
    for pid in "$g_pid_madv" "$g_pid_fsmeta" "$g_pid_ltp"; do
        _stop_child "$pid"
    done
    end "-- cleaning up remaining tasks in background"
}

# EXIT trap handler: stop workloads, report results, unmount the test
# filesystem and restore the vm sysctls changed for madvise mode.
# Exits 1 if any task group failed or the run was aborted by err/invalid.
cleanup()
{
    log "!!! EXIT signal received, need to exit testing now. !!!"
    begin "preparing to complete testing"
    stop_children
    fs_sync
    result_check
    if [ "$g_netfs" -eq 0 ]; then
        is_mounted "$g_dev" && silent_exec umount -f "$g_dev"
    else
        is_mounted "$g_netdev" && silent_exec umount -f "$g_netdev"
    fi
    if [ "$g_madvise" -eq 1 ]; then
        echo "$g_vm_dirty_background_ratio" > /proc/sys/vm/dirty_background_ratio
        echo "$g_vm_dirty_ratio" > /proc/sys/vm/dirty_ratio
        echo "$g_vm_dirty_expire_centisecs" > /proc/sys/vm/dirty_expire_centisecs
    fi
    end "preparing to complete testing"
    log "!!! Linux HWPOISON stress testing DONE !!!"
    log "result: $g_result"
    log "log: $g_logfile"
    if [ "$g_failed" -ne 0 ] || [ "$g_aborted" -ne 0 ]; then
        exit 1
    else
        exit 0
    fi
}

# Resolve conflicting injector options into exactly one injector.
select_injector()
{
# for test mode, apei injector is not supported.
    if [ "$g_test" -eq 1 ]; then
        g_apei=0
        if [ "$g_madvise" -eq 1 ]; then
            g_pfninj=0
        else
            g_soft_offline=1
        fi
    fi

# for non-test mode, apei injector is 1st priority.
    if [ "$g_apei" -eq 1 ]; then
        g_pfninj=0
        g_madvise=0
    fi

    if [ "$g_madvise" -eq 1 ]; then
        g_pfninj=0
    fi
}

g_children=0    # child process num for each workload.
                # 0 means using default child process num of each workload.
g_dev=
g_debugfs=
g_netdev=
g_fstype=ext3
g_netfs=0
g_nomkfs=0
g_force=0
g_duration=120
g_interval=5
g_runltp=0
g_ltproot="/ltp"
g_ltppan="$g_ltproot/pan/ltp-pan"
g_pagetool="page-types"
g_madvise=0
g_apei=0
g_pfninj=1
g_rootdir=$(pwd)
g_bindir=$g_rootdir/bin
g_casedir=$g_rootdir/runtest
g_logdir=$g_rootdir/log
g_testdir=$g_rootdir/hwpoison
g_resultdir=$g_rootdir/result
g_logfile=$g_resultdir/hwpoison.log
g_result=$g_resultdir/hwpoison.result
g_failed=0
g_aborted=0     # set to 1 by err/invalid so that cleanup exits non-zero
g_time_s=
g_time_e=
# Without a terminal "tty" prints "not a tty" and fails; console output is
# then dropped (the log and result files still get everything) unless -c is given.
g_tty=$(tty) || g_tty=/dev/null
g_pid_madv=
g_pid_fsmeta=
g_pid_ltp=
g_progress=
g_percent=0
g_pgtype="lru,referenced,readahead,swapcache,swapbacked,anonymous"
g_pgsize=4096       # page size on the system
g_maxpfn=           # maxpfn on the system
g_highmem_s=        # start pfn of highmem
g_highmem_e=        # end pfn of highmem
g_lowmem_s=         # start pfn of mem < 4G
g_lowmem_e=         # end pfn of mem < 4G
g_sysfs_mem="/sys/devices/system/memory"
g_soft_offline=0
g_test=0

# recyle poisoned page
g_recycle=0
g_last=0            # time (seconds since epoch) of the last recycle, 0 = none yet

# madvise injector specific global variable
g_vm_dirty_background_ratio=$(cat /proc/sys/vm/dirty_background_ratio)
g_vm_dirty_ratio=$(cat /proc/sys/vm/dirty_ratio)
g_vm_dirty_expire_centisecs=$(cat /proc/sys/vm/dirty_expire_centisecs)

# test parameters
g_parameter="$*"

while getopts ":c:d:f:hi:l:n:o:p:r:s:t:C:LMR:STAFNV" option
do
    case $option in
    c) g_tty=$OPTARG;;
    d) g_dev=$OPTARG;;
    f) g_fstype=$OPTARG;;
    l) g_logfile=$OPTARG;;
    t) g_duration=$OPTARG;;
    i) g_interval=$OPTARG;;
    n) g_netdev=$OPTARG;;
    o) g_ltproot=$OPTARG
       g_ltppan="$g_ltproot/pan/ltp-pan";;
    p) g_pgtype=$OPTARG;;
    s) g_pgsize=$OPTARG;;
    r) g_result=$OPTARG;;
    C) g_children=$OPTARG;;
    L) g_runltp=1;;
    M) g_madvise=1;;
    R) g_recycle=$OPTARG;;
    S) g_soft_offline=1;;
    T) g_test=1;;
    A) g_apei=1;;
    F) g_force=1;;
    N) g_nomkfs=1;;
    V) DEBUG=1;;
    h) usage;;
    :) invalid "option -$OPTARG requires an argument";;
    *) invalid "invalid option";;
    esac
done

select_injector
setup_log
log "!!! Linux HWPOISON stress testing starts NOW !!!"
log "!!! test will run about $g_duration seconds !!!"
setup_env
if [ "$g_madvise" -eq 0 ]; then
    [ "$g_runltp" -eq 1 ] && run_ltp
    run_workloads
fi
err_inject
[ "$g_netfs" -eq 0 ] && [ "$g_test" -eq 0 ] && run_fsck