• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /bin/bash
2#
3# Stress test driver for Linux MCA High Level Handlers
4#
5# This program is free software; you can redistribute it and/or
6# modify it under the terms of the GNU General Public
7# License as published by the Free Software Foundation; version
8# 2.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13# General Public License for more details.
14#
15# You should find a copy of v2 of the GNU General Public License somewhere
16# on your Linux system; if not, write to the Free Software Foundation,
17# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18#
19# Copyright (C) 2009, Intel Corp.
20# Author: Haicheng Li <haicheng.li@intel.com>
21#
22
#set -x		# uncomment to trace every command while debugging
# ROOT is the directory above the one holding this script; it is exported
# so that the shared library sourced below can be located.
sd=$(dirname "$0")
# "&&" keeps ROOT empty (instead of silently wrong) when the cd fails
ROOT=$(cd "$sd/.." && pwd)
export ROOT

# NOTE(review): check_debugfs, called from check_env, is not defined in this
# file; presumably it is provided by this library -- confirm.
. "$ROOT/lib/mce.sh"

# 0: hide the output of helper commands, 1: verbose (switched on by -V)
DEBUG=0
30
# Run a command in the foreground, hiding its stdout/stderr unless DEBUG
# is non-zero.
# Globals:   DEBUG (read)
# Arguments: the command followed by its arguments
# Returns:   exit status of the command
silent_exec()
{
	# "$@" keeps every argument as one word; copying it into a scalar
	# first would re-split arguments that contain whitespace.
	if [ "$DEBUG" -eq 0 ]; then
		"$@" > /dev/null 2>&1
	else
		"$@"
	fi
}
42
# Start a command in the background, hiding its stdout/stderr unless
# DEBUG is non-zero.  The job is started in the calling shell, so the
# caller can read its pid from $! right after this function returns.
# Globals:   DEBUG (read)
# Arguments: the command followed by its arguments
# Returns:   0 (status of launching the job, not of the job itself)
silent_exec_background()
{
	# "$@" keeps every argument as one word (no re-splitting)
	if [ "$DEBUG" -eq 0 ]; then
		"$@" > /dev/null 2>&1 &
	else
		"$@" &
	fi
}
54
# Write a message to the console file named by g_tty.
# Leading arguments made only of echo flags (-n, -e, -E or combinations
# such as -en) are handed to echo as options; everything after them is
# printed as one string.
# Globals:   g_tty (read)
# Arguments: [echo flags] message words
_print()
{
	local -a flags=()

	while [ $# -gt 0 ]
	do
		case $1 in
			-|-*[!neE]*) break ;;	# not a pure flag word: start of the text
			-*) flags+=("$1"); shift ;;
			*) break ;;
		esac
	done
	# The text is quoted so that tags such as "[info]" are not treated as
	# glob patterns and replaced by matching file names of the current dir.
	echo "${flags[@]}" "$*" > "$g_tty"
}
59
# Debug print: when DEBUG is 1, show the message in yellow on the console
# (through _print) and append it to the log file; otherwise do nothing.
# Globals:   DEBUG, g_logfile (read)
# Arguments: message words
dbp()
{
	[ "$DEBUG" -ne 1 ] && return
	_print -en "\\033[0;33m" # set font color as yellow
	# _print already writes to the console, no extra redirection needed
	_print "[debug] $*"
	echo "[debug] $*" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
}
68
# Informational message: shown in yellow on the console (through _print)
# and appended to the log file.
# Globals:   g_logfile (read)
# Arguments: message words
log()
{
	_print -en "\\033[0;33m" # set font color as yellow
	# _print already writes to the console, no extra redirection needed
	_print "[info] $*"
	echo "[info] $*" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
}
76
# Announce the start of a step: prints the step name followed by a green
# "[start]" tag on the console and appends the same line to the log file.
# Globals:   g_logfile (read)
# Arguments: step description
begin()
{
	_print -n "$*"
	_print -en "\\033[0;32m" # set font color as green
	_print -e "\t [start]"
	echo -e "$* \t [start]" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
}
85
# Announce the end of a step: prints the step name followed by a green
# "[done]" tag on the console and appends the same line to the log file.
# Globals:   g_logfile (read)
# Arguments: step description
end()
{
	_print -n "$*"
	_print -en "\\033[0;32m" # set font color as green
	_print -e "\t [done]"
	echo -e "$* \t [done]" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
}
94
# Fatal error: report the message in red on the console, record it in the
# result file and the log file, then terminate the script with status 1.
# Globals:   g_tty, g_result, g_logfile (read)
# Arguments: error description
err()
{
	_print -en "\\033[0;31m" # set font color as red
	echo > "$g_tty"
	echo "Test aborted by unexpected error!" > "$g_tty"
	_print "[error] !!! $* !!!"
	echo > "$g_tty"
	echo "Test aborted by unexpected error!" >> "$g_result"
	echo "[error] !!! $* !!!" >> "$g_result"
	echo "[error] !!! $* !!!" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
	exit 1
}
108
# Alias of err: report a fatal error and exit.
# Arguments: error description (passed through unchanged)
die()
{
	err "$@"
}
113
# Fatal usage/environment error: like err, but additionally points the
# user at the help page before exiting with status 1.
# Globals:   g_tty, g_result, g_logfile (read)
# Arguments: error description
invalid()
{
	_print -en "\\033[0;31m" # set font color as red
	echo > "$g_tty"
	echo "Test aborted by unexpected error!" > "$g_tty"
	_print "[error] !!! $* !!!"
	echo > "$g_tty"
	# the script is invoked as ./hwpoison.sh (see usage)
	echo "Try \`./hwpoison.sh -h\` for more information." > "$g_tty"
	echo > "$g_tty"
	echo "Test aborted by unexpected error!" >> "$g_result"
	echo "[error] !!! $* !!!" >> "$g_result"
	echo "[error] !!! $* !!!" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
	exit 1
}
129
# Report a result line: shown in blue on the console and appended to both
# the result file and the log file.  Backslash escapes in the text are
# interpreted (echo -e).
# Globals:   g_result, g_logfile (read)
# Arguments: result text
result()
{
	_print -en "\\033[0;34m" # set font color as blue
	_print -e "$*"
	echo -e "$*" >> "$g_result"
	echo -e "$*" >> "$g_logfile"
	_print -en "\\033[0;39m"    # restore font color to normal
}
138
# Print the end address (hex, no 0x prefix) of the "System RAM" range in
# /proc/iomem whose start address is $1 (hex, no 0x prefix, leading zeros
# ignored).  Prints nothing when no such range is listed.
_iomem_ram_end()
{
	grep "System RAM" /proc/iomem | grep -E "^ *0*$1-" | head -n 1 |
		awk -F "-" '{print $2}' | awk '{print $1}'
}

# Work out the page-frame ranges used by the "all" page type injector.
# Globals read:    g_pgsize
# Globals written: g_lowmem_s/g_lowmem_e   (RAM range starting at 1M)
#                  g_highmem_s/g_highmem_e (RAM range starting at 4G;
#                                           g_highmem_e only if it exists)
#                  g_maxpfn                (MemTotal expressed in pages)
setup_meminfo()
{
	local maxmem=0
	local lowmem_s=$(( 0x100000 ))		# memory below 4G starts at 1M
	local lowmem_e=0
	local highmem_s=$(( 0x100000000 ))	# memory above 4G
	local highmem_e=0
	local tmp=

	g_lowmem_s=$(( lowmem_s / g_pgsize ))
	tmp=$(_iomem_ram_end 100000)
	# without a matching range the end stays 0 (no error noise from a
	# failed hex conversion)
	[ -n "$tmp" ] && lowmem_e=$(( 0x$tmp ))
	g_lowmem_e=$(( lowmem_e / g_pgsize ))
	log "low mem: 0x100000 (pfn: $g_lowmem_s) ~ 0x$tmp (pfn: $g_lowmem_e)"

	g_highmem_s=$(( highmem_s / g_pgsize ))
	tmp=$(_iomem_ram_end 100000000)
	if [ -n "$tmp" ]; then
		highmem_e=$(( 0x$tmp ))
		g_highmem_e=$(( highmem_e / g_pgsize ))
		log "high mem: 0x100000000 (pfn: $g_highmem_s) ~ 0x$tmp (pfn: $g_highmem_e)"
	fi

	# MemTotal is reported in kB; convert with the configured page size
	# instead of assuming 4 kB pages.
	maxmem=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
	g_maxpfn=$(( ${maxmem:-0} * 1024 / g_pgsize ))
	log "max pfn number: g_maxpfn = $g_maxpfn"
}
168
# Configure the hwpoison debugfs filter for the selected injector.
#  - soft offline: switch the corrupt filter off (if the knob exists)
#  - madvise:      switch the filter off and raise the vm dirty thresholds
#  - otherwise:    restrict injection to pages of the device g_dev by
#                  writing its major/minor numbers and enabling the filter
# Globals: g_soft_offline, g_madvise, g_debugfs, g_dev, g_pgtype (read)
setup_errinj()
{
	local dev_major=
	local dev_minor=
	local rc=0
	local filter=$g_debugfs/hwpoison

	if [ "$g_soft_offline" -eq 1 ]; then
		[ -f "$filter/corrupt-filter-enable" ] && echo 0 > "$filter/corrupt-filter-enable"
		return
	fi
	if [ "$g_madvise" -eq 1 ]; then
		[ -f "$filter/corrupt-filter-enable" ] && echo 0 > "$filter/corrupt-filter-enable"
		# to avoid unexpected page-state changing in background while testing.
		echo 70 > /proc/sys/vm/dirty_background_ratio
		echo 70 > /proc/sys/vm/dirty_ratio
		echo 1000000 > /proc/sys/vm/dirty_expire_centisecs
		return
	fi
	# stat prints the device numbers in hex; its error output is silenced
	# inside the substitution, where the redirection actually applies
	dev_major=0x$(/usr/bin/stat --format=%t "$g_dev" 2> /dev/null) || rc=1
	dev_minor=0x$(/usr/bin/stat --format=%T "$g_dev" 2> /dev/null) || rc=1
	[ $rc -eq 1 ] && invalid "invalid device: no inode # can be found"
	echo "$dev_major" > "$filter/corrupt-filter-dev-major"
	echo "$dev_minor" > "$filter/corrupt-filter-dev-minor"
	[ "$g_pgtype" = "all" ] && [ -f "$filter/corrupt-filter-flags-mask" ] &&
		echo 0 > "$filter/corrupt-filter-flags-mask"
	[ -f "$filter/corrupt-filter-enable" ] && echo 1 > "$filter/corrupt-filter-enable"
	return
}
198
# Create (unless -N or a network fs is used) and mount the filesystem
# under test on g_testdir.  Aborts through err on any failure.
# Globals: g_fstype, g_testdir, g_nomkfs, g_netfs, g_force, g_dev,
#          g_netdev (read)
setup_fs()
{
	local mkfs="mkfs.$g_fstype"
	local mkfs_opts="-q"
	local mount_opts
	local in

	[ "$g_fstype" = reiserfs ] && mkfs="mkreiserfs"
	[ "$g_fstype" = ocfs2 ] && mkfs_opts="$mkfs_opts -M local"
	[ "$g_fstype" = cifs ] && mount_opts="-o password="""
	mkdir -p "$g_testdir" || err "cannot mkdir $g_testdir"
	if [ "$g_nomkfs" -eq 0 ] && [ "$g_netfs" -eq 0 ]; then
		silent_exec which "$mkfs" || err "mkfs: unsupported fstype: $g_fstype"
		if [ "$g_force" -eq 0 ] && [ "$g_fstype" != "ocfs2" ]; then
			echo -n "test will format $g_dev to $g_fstype, continue [y/n]? "
			read -r in
			# quoted so that an empty or multi-word answer is a plain "no"
			[ "$in" = 'y' ] || [ "$in" = "yes" ] || [ "$in" = 'Y' ] || err "$mkfs on $g_dev is cancelled"
		fi
		begin "-- $mkfs $g_dev"
		if [ "$g_fstype" = "vfat" ] || [ "$g_fstype" = "msdos" ] || [ "$g_fstype" = "btrfs" ]; then
			mkfs_opts=""
		elif [ "$g_fstype" = "xfs" ]; then
			mkfs_opts="-f"
		fi
		[ "$g_fstype" = ocfs2 ] && echo -n "test will format $g_dev to $g_fstype, continue [y/n]? "
		# $mkfs_opts is left unquoted on purpose: it may hold several options
		silent_exec "$mkfs" $mkfs_opts "$g_dev" || err "cannot $mkfs $mkfs_opts on $g_dev"
		end "-- $mkfs $g_dev"
	fi
	if [ "$g_netfs" -eq 0 ]; then
		silent_exec mount -t "$g_fstype" "$g_dev" "$g_testdir" || err "cannot mount $g_fstype fs: $g_dev to $g_testdir"
	else
		# $mount_opts is left unquoted on purpose ("-o" and its value)
		silent_exec mount -t "$g_fstype" $mount_opts "$g_netdev" "$g_testdir" || err "cannot mount $g_fstype $mount_opts fs: $g_netdev to $g_testdir"
	fi
}
232
# Validate the test environment before anything is modified: console,
# target device, helper tools, kernel injection interfaces and ltp.
# Every failed check aborts through invalid.
# Globals read:    g_tty, g_test, g_fstype, g_netdev, g_dev, g_bindir,
#                  g_madvise, g_recycle, g_pfninj, g_soft_offline,
#                  g_sysfs_mem, g_apei, g_ltproot, g_ltppan, g_runltp,
#                  g_duration
# Globals written: g_debugfs, g_netfs, g_pagetool (resolved to full path)
check_env()
{
	check_debugfs
	# first debugfs mount point only, in case it is mounted more than once
	g_debugfs=$(mount | grep debugfs | cut -d ' ' -f3 | head -n 1)
	[ -z "$g_tty" ] && invalid "$g_tty does not exist"
	if [ "$g_test" -eq 0 ]; then
		if [ "$g_fstype" = "nfs" ] || [ "$g_fstype" = "cifs" ]; then
			g_netfs=1
			[ -z "$g_netdev" ] && invalid "net device is not specified"
		fi
		[ -z "$g_dev" ] && invalid "device is not specified"
		[ -b "$g_dev" ] || invalid "invalid device: $g_dev"
		if [ "$g_netfs" -eq 0 ]; then
			df | grep -F -- "$g_dev" > /dev/null 2>&1 && invalid "device $g_dev has been mounted by others"
		else
			df | grep -F -- "$g_netdev" > /dev/null 2>&1 && invalid "device $g_netdev has been mounted by others"
		fi
	fi
	[ -d "$g_bindir" ] || invalid "no bin subdir there"
	if [ "$g_madvise" -eq 0 ] || [ "$g_recycle" -ne 0 ]; then
		silent_exec which "$g_pagetool" || invalid "no $g_pagetool tool on the system"
		g_pagetool=$(which "$g_pagetool")
		dbp "Found the tool: $g_pagetool"
	fi
	if [ "$g_pfninj" -eq 1 ]; then
		if [ "$g_soft_offline" -eq 1 ]; then
			[ -f "$g_sysfs_mem/soft_offline_page" ] || invalid "pls. ensure soft_offline_page is enabled"
		else
			#if hwpoison_inject is a module, it is ensured to have been loaded
			if modinfo hwpoison_inject > /dev/null 2>&1; then
				[ -d "$g_debugfs/hwpoison/" ] || modprobe hwpoison_inject ||
					invalid "module hwpoison_inject isn't supported ?"
			fi
		fi
	fi
	if [ "$g_recycle" -ne 0 ]; then
		[ -f "$g_debugfs/hwpoison/unpoison-pfn" ] || invalid "pls. insmod hwpoison_inject module with unpoison-pfn support"
	fi
	if [ "$g_apei" -eq 1 ]; then
		#if einj is a module, it is ensured to have been loaded
		if modinfo einj > /dev/null 2>&1; then
			[ -d "$g_debugfs/apei/einj" ] || modprobe einj ||
				invalid "module apei_inj isn't supported ?"
		fi
	fi
	{ [ -d "$g_ltproot" ] && [ -f "$g_ltppan" ]; } || invalid "no ltp-pan on the machine: $g_ltppan"
	if [ "$g_runltp" -eq 1 ]; then
		{ [ -d "$g_ltproot" ] && [ -f "$g_ltproot/runltp" ]; } || invalid "no runltp on the machine"
	fi
	[ "$g_duration" -eq 0 ] && invalid "test duration is set as 0 second"
}
286
# Prepare the result/log directories and start the log, result and
# console output with a banner showing the command line parameters.
# Any previous content of g_logdir is removed.
# Globals: g_resultdir, g_logdir, g_logfile, g_result, g_tty, g_test,
#          g_parameter (read)
setup_log()
{
	local banner="# hwpoison.sh $g_parameter"

	mkdir -p "$g_resultdir"
	rm -rf "$g_logdir"
	mkdir -p "$g_logdir"
	echo "$banner" > "$g_logfile"
	echo "$banner" > "$g_result"
	# the console is only cleared for real (non-test-mode) runs
	[ "$g_test" -eq 0 ] && clear > "$g_tty"
	echo "$banner" > "$g_tty"
}
297
# Bring up the whole test environment: sanity checks, error injector and
# memory layout setup, exit trap, filesystem (skipped in test mode) and
# PATH extension with the bin directory.
# Globals: g_casedir, g_test, g_bindir (read); PATH (extended, exported)
setup_env()
{
	begin "setup test environment"
	mkdir -p "$g_casedir"
	check_env
	setup_errinj
	setup_meminfo
	# from here on every exit path goes through cleanup
	trap "cleanup" 0
	[ "$g_test" -eq 0 ] && setup_fs
	export PATH="${PATH}:$g_bindir"
	end "setup test environment"
}
310
# Start the ltp test suite in the background for g_duration seconds,
# working inside the filesystem under test.
# Globals read:    g_logdir, g_testdir, g_ltproot, g_duration
# Globals written: g_pid_ltp (pid of the background runltp job)
run_ltp()
{
	local ltp_failed=$g_logdir/ltp/ltp_failed
	local ltp_log=$g_logdir/ltp/ltp_log
	local ltp_output=$g_logdir/ltp/ltp_output
	local ltp_tmp=$g_testdir/ltp_tmp

	begin "launch ltp workload in background"
	mkdir -p "$g_logdir/ltp"
	# start with empty report files
	: > "$ltp_failed"
	: > "$ltp_log"
	: > "$ltp_output"
	mkdir -p "$ltp_tmp"
	silent_exec_background "$g_ltproot/runltp" -d "$ltp_tmp" -l "$ltp_log" -o "$ltp_output" -r "$g_ltproot" -t "${g_duration}s" -C "$ltp_failed"
	g_pid_ltp=$!
	end "launch ltp workload in background (pid: $g_pid_ltp)"
}
328
# Summarize the ltp run: every line of the ltp_failed file counts as one
# failed case.  A missing file or any failed case increments g_failed.
# Globals: g_logdir (read), g_failed (updated)
ltp_result()
{
	local num=0
	local ltp_failed=$g_logdir/ltp/ltp_failed
	local ltp_output=$g_logdir/ltp/ltp_output

	if [ ! -f "$ltp_failed" ]; then
		result "\tltp -- error: no ltp result there"
		result "\t    log: $ltp_output"
		g_failed=$(( g_failed + 1 ))
		return
	fi
	# arithmetic expansion strips any padding wc may add
	num=$(( $(wc -l < "$ltp_failed") ))
	if [ "$num" -ne 0 ]; then
		result "\tltp -- $num case(s) failed"
		result "\t    log: $ltp_output"
		g_failed=$(( g_failed + 1 ))
	else
		result "\tltp -- all tests pass"
	fi
}
350
351
# Launch the fs_metadata workload through ltp-pan in the background.
# The number of worker threads is g_children, or g_duration / 720 when
# g_children is 0; it is clamped to the range 1..10 (node_number is
# raised to 6 when the clamp to 10 applies).
# Globals read:    g_logdir, g_testdir, g_casedir, g_children, g_duration,
#                  g_ltppan, g_interval
# Globals written: g_pid_fsmeta (pid of the background ltp-pan job)
fs_metadata()
{
	local dir=$g_logdir/fs_metadata
	local result=$dir/fs_metadata.result
	local log=$dir/fs_metadata.log
	local pan_log=$dir/pan_log
	local pan_output=$dir/pan_output
	local pan_zoo=$dir/pan_zoo
	local pan_failed=$dir/pan_failed
	local tmp=$g_testdir/fs_metadata
	local threads=
	local node_number=5
	local tree_depth=6

	if [ "$g_children" -eq 0 ]; then
		threads=$(( g_duration / 720 ))
	else
		threads=$g_children
	fi
	[ "$threads" -gt 10 ] && threads=10 && node_number=6
	[ "$threads" -eq 0 ] && threads=1

	begin "launch fs_metadata workload"
	mkdir -p "$dir"
	: > "$pan_failed"
	: > "$pan_log"
	: > "$pan_output"
	: > "$pan_zoo"
	log "setup fs_metadata test environment"
	# The old work dir is removed synchronously: a background rm could
	# still be running when the directory is re-created just below and
	# delete the fresh directory as well.
	silent_exec rm -rf "$tmp"
	mkdir -p "$tmp" || err "cannot create dir: $tmp"

	echo "fs_metadata fs-metadata.sh $tree_depth $node_number $threads $g_duration $result $tmp $log" > "$g_casedir/fs_metadata"
	dbp "$g_ltppan -n fs_metadata -a $pan_zoo -f $g_casedir/fs_metadata -o $pan_output -l $pan_log -C $pan_failed &"
	silent_exec_background "$g_ltppan" -n fs_metadata -a "$pan_zoo" -f "$g_casedir/fs_metadata" -o "$pan_output" -l "$pan_log" -C "$pan_failed"
	g_pid_fsmeta=$!
	# give the workload some time to start, then look for an early abort
	sleep "$g_interval"
	silent_exec grep "abort" "$log" && err "failed to launch fs_metadata workload, it might be due to insufficient disk space, pls read $log for details!"
	end "launch fs_metadata workload (pid: $g_pid_fsmeta)"
}
392
# Summarize the fs_metadata workload from its result file, which is
# expected to carry "FAIL...:<n>" and "PASS...:<n>" lines (the number is
# taken from the last ':'-separated field).  g_failed is incremented when
# the file is missing, when tests failed or when nothing passed.
# Globals: g_logdir (read), g_failed (updated)
fs_metadata_result()
{
	local fail_num=0
	local pass_num=0
	local dir=$g_logdir/fs_metadata
	local result=$dir/fs_metadata.result
	local log=$dir/fs_metadata.log

	if [ ! -f "$result" ]; then
		result "\tfs_metadata -- error: no result there"
		result "\t    details: $log"
		g_failed=$(( g_failed + 1 ))
		return
	fi
	fail_num=$(grep FAIL "$result" | awk -F : '{print $NF}')
	pass_num=$(grep PASS "$result" | awk -F : '{print $NF}')
	# without a FAIL line the run is treated as not finished
	[ -z "$fail_num" ] && fail_num=0 && pass_num=0
	# a missing PASS line means nothing passed; an empty value would make
	# the numeric test below error out and fall into the "all pass" branch
	[ -z "$pass_num" ] && pass_num=0
	if [ "$fail_num" -ne 0 ]; then
		result "\tfs_metadata -- $fail_num tests failed, $pass_num tests pass."
		result "\t    details: $result"
		g_failed=$(( g_failed + 1 ))
	elif [ "$pass_num" -eq 0 ]; then
		result "\tfs_metadata -- no test finished"
		result "\t    details: $log"
		g_failed=$(( g_failed + 1 ))
	else
		result "\tfs_metadata -- all $pass_num tests got pass"
	fi

	return
}
426
# fs_specific workload, TBD
# For now this only creates an empty marker file in the log directory;
# the real ltp-pan invocation is still commented out.
# Globals: g_fstype, g_logdir (read)
fs_specific()
{
	begin "launch $g_fstype specific workload"

	touch "$g_logdir/fs_specific"
#	$g_ltppan -n fs_specific -a $g_logdir/fs_specific -f $g_casedir/fs_specific -t ${g_duration}s &
	end "launch $g_fstype specific workload"
}
436
# Launch the page-poisoning (madvise) test through ltp-pan in the
# background for g_duration seconds.
# Globals read:    g_logdir, g_testdir, g_casedir, g_children, g_ltppan,
#                  g_duration
# Globals written: g_pid_madv (pid of the background ltp-pan job)
page_poisoning()
{
	local dir=$g_logdir/page_poisoning
	local pan_failed=$dir/pan_failed
	local pan_log=$dir/pan_log
	local pan_output=$dir/pan_output
	local tmp=$g_testdir/page_poisoning
	local pan_zoo=$dir/pan_zoo
	local result=$dir/page_poisoning.result
	local log=$dir/page_poisoning.log
	local opts=

	begin "-- launch page_poisoning test"
	mkdir -p "$dir"
	# start with empty report files
	: > "$pan_failed"
	: > "$pan_log"
	: > "$pan_output"
	: > "$pan_zoo"
	: > "$log"
	: > "$result"
	mkdir -p "$tmp" || err "cannot create dir: $tmp"

	# only pass an instance count when the user asked for one (-C)
	[ "$g_children" -ne 0 ] && opts="-i $g_children"

	echo "page_poisoning page-poisoning -l $log -r $result -t $tmp $opts" > "$g_casedir/page_poisoning"
	dbp "$g_ltppan -n page_poisoning -a $pan_zoo -f $g_casedir/page_poisoning -t ${g_duration}s -o $pan_output -l $pan_log -C $pan_failed &"
	silent_exec_background "$g_ltppan" -n page_poisoning -a "$pan_zoo" -f "$g_casedir/page_poisoning" -t "${g_duration}s" -o "$pan_output" -l "$pan_log" -C "$pan_failed"
	g_pid_madv=$!
	end "-- launch page_poisoning test (pid: $g_pid_madv)"
}
467
# Summarize the page_poisoning test by counting the FAILED and PASS lines
# of its result file.  g_failed is incremented when the file is missing,
# when a case failed or when no case passed.
# Globals: g_logdir (read), g_failed (updated)
page_poisoning_result()
{
	local fail_num=0
	local pass_num=0
	local dir=$g_logdir/page_poisoning
	local result=$dir/page_poisoning.result
	local log=$dir/page_poisoning.log

	if [ ! -f "$result" ]; then
		result "\tpage_poisoning -- error: no result file there"
		result "\t    details: $log"
		g_failed=$(( g_failed + 1 ))
		return
	fi
	fail_num=$(grep -c FAILED "$result")
	pass_num=$(grep -c PASS "$result")
	if [ "$fail_num" -ne 0 ]; then
		result "\tpage_poisoning -- $fail_num tests failed, $pass_num tests pass."
		result "\t    details: $result"
		g_failed=$(( g_failed + 1 ))
	elif [ "$pass_num" -eq 0 ]; then
		result "\tpage_poisoning -- no case finished"
		result "\t    details: $log"
		g_failed=$(( g_failed + 1 ))
	else
		result "\tpage_poisoning -- all $pass_num tests got pass"
	fi

	return
}
500
# Start the background workloads that keep the filesystem busy while
# errors are injected.  Only fs_metadata is launched; the fs_specific
# workload stays disabled.
run_workloads()
{
	#fs_specific
	fs_metadata
}
507
# Un-poison one page by writing its page frame number to the hwpoison
# debugfs knob unpoison-pfn.
# Globals:   g_debugfs (read)
# Arguments: $1 - page frame number as written to the knob
_pfn_unpoison()
{
	local pg=$1

	echo "$pg" > "$g_debugfs/hwpoison/unpoison-pfn"
	dbp "echo $pg > $g_debugfs/hwpoison/unpoison-pfn"
}
515
# Un-poison every page the page tool currently lists with the hwpoison
# flag.  The first tab-separated column of the tool output (header line
# containing "offset" excluded) is used as a hex page frame number.
# Globals: g_pagetool (read)
pfn_unpoison()
{
	local pg_list=
	local pg=0

	pg_list=$("$g_pagetool" -NLrb hwpoison | grep -v offset | cut -f1)
	for pg in $pg_list
	do
		_pfn_unpoison "0x$pg" > /dev/null 2>&1
	done
}
531
# Report how much of the test duration has elapsed.  A log line is only
# written once the elapsed percentage has moved at least 10 points past
# the last reported value (g_percent).  When page recycling is enabled
# (g_recycle != 0) the counter g_last is advanced here and, once it
# reaches g_recycle, poisoned pages are released through pfn_unpoison.
# Globals read:    g_soft_offline, g_time_e, g_duration, g_recycle
# Globals updated: g_percent, g_last
show_progress()
{
	local now=
	local remaining=0
	local done_pct=0
	local threshold=0
	local label="hwpoison page error injection"

	if [ "$g_soft_offline" -eq 1 ]; then
		label="page soft offline"
	fi

	now=$(date +%s)
	if [ "$now" -ge "$g_time_e" ]; then
		return
	fi
	remaining=$(( g_time_e - now ))
	done_pct=$(( (g_duration - remaining) * 100 / g_duration ))
	if [ "$done_pct" -eq 0 ]; then
		return
	fi
	if [ "$g_recycle" -ne 0 ]; then
		g_last=$(( ((done_pct - g_percent) * g_duration) + g_last ))
		if [ "$g_last" -ge "$g_recycle" ]; then
			g_last=0
			pfn_unpoison
		fi
	fi
	if [ "$done_pct" -gt 10 ]; then
		threshold=$(( done_pct - 10 ))
	fi
	# nothing to report until 10 more points have been covered
	if [ "$g_percent" -ne 0 ] && [ "$g_percent" -gt "$threshold" ]; then
		return
	fi
	g_percent=$done_pct
	log "$label: $g_percent% pages done"
}
559
# Inject a hwpoison error into one page by writing its page frame number
# to the hwpoison debugfs knob corrupt-pfn.
# Globals:   g_debugfs (read)
# Arguments: $1 - page frame number as written to the knob
_pfn_hwpoison()
{
	local pfn=$1

	echo "$pfn" > "$g_debugfs/hwpoison/corrupt-pfn"
	dbp "echo $pfn > $g_debugfs/hwpoison/corrupt-pfn"
}
567
# Soft-offline one page: the page frame number is converted to a physical
# address (pfn * g_pgsize) and written, as 0x-prefixed hex, to the sysfs
# file soft_offline_page.
# Globals:   g_pgsize, g_sysfs_mem (read)
# Arguments: $1 - page frame number (decimal or 0x-prefixed hex)
_pfn_soft_offline()
{
	local pfn=$1
	local paddr

	paddr=$(printf "0x%x" $(( pfn * g_pgsize )))
	echo "$paddr" > "$g_sysfs_mem/soft_offline_page"
	dbp "echo $paddr > $g_sysfs_mem/soft_offline_page"
}
581
# Inject errors page by page, either by hwpoison or by soft offline.
#  - page type "all": walk the low memory range (g_lowmem_s..g_lowmem_e),
#    then the high memory range (g_highmem_s..g_highmem_e) if there is
#    one, stopping at g_maxpfn.
#  - any other page type: inject into the pages listed by the page tool.
# Every g_progress injections the deadline g_time_e is checked and the
# progress display is refreshed.
# Globals: g_soft_offline, g_pgtype, g_lowmem_s, g_lowmem_e, g_highmem_s,
#          g_highmem_e, g_maxpfn, g_progress, g_time_e, g_pagetool (read)
pfn_inj()
{
	local pg_list=
	local pg=0
	local pfn=0
	local cur=
	local i=0
	local inj=_pfn_hwpoison

	[ "$g_soft_offline" -eq 1 ] && inj=_pfn_soft_offline
	if [ "$g_pgtype" = "all" ]; then
		pfn=$g_lowmem_s 	# start from 1M.
		while [ "$pfn" -lt "$g_maxpfn" ]
		do
			pg=$(printf "%x" "$pfn")
			$inj "0x$pg" > /dev/null 2>&1
			pfn=$(( pfn + 1 ))
			# Jump over the hole between low and high memory only while
			# inside the hole; testing just "pfn > g_lowmem_e" would
			# reset the walk to the first high page on every iteration.
			if [ "$pfn" -gt "$g_lowmem_e" ] && [ "$pfn" -lt "$g_highmem_s" ]; then
				[ -n "$g_highmem_e" ] || break	# no memory above 4G
				pfn=$g_highmem_s
			fi
			[ -n "$g_highmem_e" ] && [ "$pfn" -gt "$g_highmem_e" ] && break
			i=$(( i + 1 ))
			if [ "$i" -eq "$g_progress" ]; then
				cur=$(date +%s)
				[ "$cur" -ge "$g_time_e" ] && break
				show_progress
				i=0
			fi
		done
	else
		silent_exec "$g_pagetool" -Nrb "$g_pgtype" || err "unsupported pagetype, pls. refer to command: $g_pagetool -h"
		pg_list=$("$g_pagetool" -NLrb "$g_pgtype" | grep -v offset | cut -f1)
		for pg in $pg_list
		do
			$inj "0x$pg" > /dev/null 2>&1
			i=$(( i + 1 ))
			if [ "$i" -eq "$g_progress" ]; then
				cur=$(date +%s)
				[ "$cur" -ge "$g_time_e" ] && break
				show_progress
				i=0
			fi
		done
	fi
}
625
# Inject one error through the APEI EINJ debugfs interface.
# All three files are written under $g_debugfs/apei/einj, the directory
# check_env verifies before APEI injection is used.
# NOTE(review): the file name "error_address" is kept from the original
# code -- confirm that the target kernel's einj directory provides it.
# Globals:   g_debugfs, g_pgsize (read; 4096 is assumed when unset)
# Arguments: $1 - page frame number (decimal or 0x-prefixed hex)
#            $2 - EINJ error type value
_apei_inj()
{
	local type=$2
	local einj=$g_debugfs/apei/einj
	local paddr

	# physical address of the page: pfn * page size
	paddr=$(printf "0x%x" $(( $1 * ${g_pgsize:-4096} )))
	echo "$type" > "$einj/error_type"
	echo "$paddr" > "$einj/error_address"
	echo "1" > "$einj/error_inject"
}
635
# Inject an APEI error of type 0x2 into the given page.
# Arguments: $1 - page frame number
apei_ewb_ucr()
{
	# quoted so an empty pfn cannot shift the type into the pfn slot
	_apei_inj "$1" 0x2
}
640
# Inject an APEI error of type 0x10 into the given page.
# Arguments: $1 - page frame number
apei_mem_ucr()
{
	# quoted so an empty pfn cannot shift the type into the pfn slot
	_apei_inj "$1" 0x10
}
645
# Inject APEI errors into every page the page tool lists for g_pgtype.
# Every g_progress injections the deadline g_time_e is checked and the
# progress display is refreshed.
# Globals: g_pagetool, g_pgtype, g_progress, g_time_e (read)
apei_inj()
{
	local pg_list=
	local pg=
	local cur=
	local i=0

	pg_list=$("$g_pagetool" -NLrb "$g_pgtype" | grep -v offset | cut -f1)
	for pg in $pg_list
	do
		# The tool column is used as hex without prefix (pfn_inj also
		# prepends 0x); without it the later printf "%x" conversion
		# fails on digits a-f or treats the value as decimal.
		apei_mem_ucr "0x$pg"
		i=$(( i + 1 ))
		if [ "$i" -eq "$g_progress" ]; then
			cur=$(date +%s)
			[ "$cur" -ge "$g_time_e" ] && break
			show_progress
			i=0
		fi
	done

	return
}
668
# Main injection phase: keep injecting errors until g_duration seconds
# have passed, then wait g_interval seconds for the workloads.
#  - madvise mode: the page_poisoning test does the injection itself;
#    this function only polls the clock and refreshes the progress
#    display every g_progress loop iterations.
#  - otherwise: each loop iteration runs the APEI and/or pfn injector.
# Globals read:    g_soft_offline, g_madvise, g_pgtype, g_duration,
#                  g_apei, g_pfninj, g_interval
# Globals written: g_progress, g_time_s, g_time_e
err_inject()
{
	local now=
	local ticks=0
	local short="hwpoison page error injection"
	local banner="inject HWPOISON error to pages"

	if [ "$g_soft_offline" -eq 1 ]; then
		short="page soft offline"
		banner="soft OFFLINE pages"
	fi
	# the same text frames the phase in begin and end
	if [ "$g_madvise" -eq 1 ]; then
		banner="$banner thru madvise syscall"
	else
		banner="$banner ($g_pgtype)"
	fi

	begin "$banner"
	g_progress=$(( g_duration * 10 ))
	g_time_s=$(date +%s)
	g_time_e=$(( g_time_s + g_duration ))
	now=$g_time_s
	if [ "$g_madvise" -eq 1 ]; then
		page_poisoning
	fi
	log "$short: 0% pages done"
	if [ "$g_madvise" -eq 1 ]; then
		show_progress
	fi
	while [ "$now" -lt "$g_time_e" ]
	do
		if [ "$g_madvise" -eq 0 ]; then
			show_progress
			[ "$g_apei" -eq 1 ] && apei_inj
			[ "$g_pfninj" -eq 1 ] && pfn_inj
		else
			if [ "$ticks" -eq "$g_progress" ]; then
				show_progress
				ticks=0
			fi
			ticks=$(( ticks + 1 ))
		fi
		now=$(date +%s)
	done
	log "$short: 100% pages done"
	# wait workloads to be finished.
	sleep $g_interval

	end "$banner"
}
721
# Record a fsck failure: the result and log files of the fsck step are
# overwritten with a single "FAILED: ..." line.
# Globals:   g_logdir (read)
# Arguments: failure description
fsck_err()
{
	local dir=$g_logdir/fsck
	local result=$dir/fsck.result
	local log=$dir/fsck.log

	echo "FAILED: $*" > "$result"
	echo "FAILED: $*" > "$log"
}
731
# Record a successful fsck: the result and log files of the fsck step are
# overwritten with a single "PASS: ..." line.
# Globals:   g_logdir (read)
# Arguments: description
fsck_pass()
{
	local dir=$g_logdir/fsck
	local result=$dir/fsck.result
	local log=$dir/fsck.log

	echo "PASS: $*" > "$result"
	echo "PASS: $*" > "$log"
}
741
# Check the filesystem under test after the injection phase: unmount the
# device, run the matching fsck tool on it and mount it back.  The
# verdict is recorded through fsck_err / fsck_pass.
# Globals: g_logdir, g_fstype, g_dev, g_testdir, g_interval (read)
run_fsck()
{
	local dir=$g_logdir/fsck
	local result=$dir/fsck.result
	local log=$dir/fsck.log
	local fsck=fsck.$g_fstype
	local opts=""
	local rc=0

	mkdir -p "$dir"
	: > "$log"
	: > "$result"

	[ "$g_fstype" = "btrfs" ] && fsck="btrfsck"
	[ "$g_fstype" = "reiserfs" ] && {
		fsck="reiserfsck"
		opts="-y"
	}
	begin "launch $fsck on $g_dev to check test result"
	silent_exec which "$fsck" || {
		fsck_err "fsck: unsupported fstype: $g_fstype"
		return
	}
	fs_sync
	silent_exec umount -f "$g_dev" || sleep "$g_interval"
	# still mounted after the forced attempt: try once more, give up on failure
	if df | grep -F -- "$g_dev" > /dev/null 2>&1; then
		silent_exec umount "$g_dev" || {
			fsck_err "cannot umount $g_dev to do $fsck"
			return
		}
	fi
	# $opts is left unquoted on purpose: it may be empty
	$fsck $opts "$g_dev" || {
		rc=$?
		fsck_err "err #$rc while $fsck on $g_dev"
	}
	silent_exec mount -t "$g_fstype" "$g_dev" "$g_testdir" || {
		fsck_err "cannot mount $g_testdir back after fsck_check"
		return
	}
	# only report a pass when fsck itself succeeded; otherwise the
	# FAILED record written above would be overwritten by PASS
	[ $rc -eq 0 ] && fsck_pass "$fsck got pass on $g_dev"
	end "launch $fsck on $g_dev to check test result"
}
781
# Summarize the fsck step by counting the FAILED and PASS lines of its
# result file.  g_failed is incremented when the file is missing, when
# fsck failed or when it was not executed.
# Globals: g_logdir, g_fstype, g_dev (read), g_failed (updated)
fsck_result()
{
	local dir=$g_logdir/fsck
	local result=$dir/fsck.result
	local log=$dir/fsck.log
	local fail_num=0
	local pass_num=0

	if [ ! -f "$result" ]; then
		result "\tfsck.$g_fstype -- no result found"
		result "\t    details: $log"
		g_failed=$(( g_failed + 1 ))
		return
	fi

	fail_num=$(grep -c FAILED "$result")
	pass_num=$(grep -c PASS "$result")
	if [ "$fail_num" -ne 0 ]; then
		result "\tfsck.$g_fstype -- failed"
		result "\t    log: $log"
		g_failed=$(( g_failed + 1 ))
	elif [ "$pass_num" -eq 0 ]; then
		result "\tfsck.$g_fstype -- not executed"
		result "\t    log: $log"
		g_failed=$(( g_failed + 1 ))
	else
		result "\tfsck.$g_fstype -- fsck on $g_dev got pass"
	fi
}
812
# Collect the verdicts of all task groups and print the result summary,
# framed by separator lines.  Which summaries are included depends on the
# injector (madvise or not), on -L (ltp) and on whether a local device
# was tested (fsck).
# Globals: g_madvise, g_runltp, g_netfs, g_test, g_failed (read)
result_check()
{
	local frame="#############################################"

	begin "-- collecting test result"
	result "$frame"
	result "result summary:"
	if [ "$g_madvise" -eq 1 ]; then
		page_poisoning_result
	else
		fs_metadata_result
		if [ "$g_runltp" -eq 1 ]; then
			ltp_result
		fi
	fi
	# fsck only ran for a local block device outside of test mode
	if [ "$g_netfs" -eq 0 ] && [ "$g_test" -eq 0 ]; then
		fsck_result
	fi
	result ""
	result "totally $g_failed task-groups report failures"
	result "$frame"
	end "-- collecting test result"
}
830
# Print the help page to stdout and exit with status 0.
# Each argument of the single printf below is one output line; %b expands
# the backslash escapes (tabs) inside the text.
# Globals: g_interval, g_ltproot, g_pgsize, g_duration, g_pgtype (read,
#          shown as the current defaults)
usage()
{
	printf '%b\n' \
		"Usage: ./hwpoison.sh -d /dev/device [-options] [arguments]" \
		"" \
		"Stress Testing for Linux MCA High Level Handlers: " \
		"\t-c console\t: target tty console to print test log" \
		"\t-d device\t: target block device to run test on" \
		"\t-f fstype\t: filesystem type to be tested" \
		"\t-i interval\t: sleep interval (default is $g_interval seconds)" \
		"\t-l logfile\t: log file" \
		"\t-n netdev\t: target network disk to run test on" \
		"\t-o ltproot\t: ltp root directory (default is $g_ltproot/)" \
		"\t-p pagetype\t: page type to inject error " \
		"\t-r result\t: result file" \
		"\t-s pagesize\t: page size on the system (default is $g_pgsize bytes)" \
		"\t-t duration\t: test duration time (default is $g_duration seconds)" \
		"\t-A \t\t: use APEI to inject error" \
		"\t-C children\t: process num of workloads" \
		"\t-F \t\t: execute as force mode, no interaction with user" \
		"\t-L \t\t: run ltp in background" \
		"\t-M \t\t: run page_poisoning test thru madvise syscall" \
		"\t-N \t\t: do not mkfs target block device" \
		"\t-R recyle\t: automatically unpoison pages after running recyle seconds" \
		"\t-S \t\t: test soft page offline" \
		"\t-T \t\t: test mode, run test in local dir other than on target device" \
		"\t-V \t\t: verbose mode, show debug info" \
		"\t-h \t\t: print this page" \
		"" \
		"device:" \
		"\tthis is a mandatory argument. typically, it's a disk partition." \
		"\tall temporary files will be created on this device." \
		"\terror injector will just inject errors to the pages associated" \
		"\twith this device (except for the testing thru madvise syscall)." \
		"" \
		"pagetype:" \
		"\tdefault page type:" \
		"\t    $g_pgtype" \
		"\tfor more details, pls. try \`page-types -h\`." \
		"\tsee the definition of \"bits-spec\"." \
		"" \
		"console:" \
		"\ttest can print output to the console you specified." \
		"\te.g. '-c /dev/tty1'" \
		""

	exit 0
}
878
# Tell the user that a disk sync is about to happen (it can take a
# while), then run sync.
fs_sync()
{
	local notice="now to sync up the disk under testing, might need several minutes ..."

	log "$notice"
	sync
}
884
# Terminate the background jobs started by this script that are still
# alive (page_poisoning, fs_metadata and ltp, in that order).  Each live
# job receives SIGTERM and is then given g_interval seconds.
# Globals: g_pid_madv, g_pid_fsmeta, g_pid_ltp, g_interval (read)
stop_children()
{
	local child

	begin "-- cleaning up remaining tasks in background"
	for child in "$g_pid_madv" "$g_pid_fsmeta" "$g_pid_ltp"
	do
		# job was never started
		[ -n "$child" ] || continue
		# ps succeeds only while the process still exists
		if silent_exec ps $child; then
			kill -15 $child > /dev/null 2>&1
			sleep $g_interval
		fi
	done
	end "-- cleaning up remaining tasks in background"
}
911
# Exit handler (installed with trap in setup_env): stop the background
# jobs, sync, print the result summary, unmount the tested device,
# restore the vm dirty settings changed for madvise mode and exit with
# status 1 if any task group failed, 0 otherwise.
# Globals: g_netfs, g_dev, g_netdev, g_madvise, g_vm_dirty_*, g_result,
#          g_logfile, g_failed (read)
cleanup()
{
	log "!!! EXIT signal received, need to exit testing now. !!!"
	begin "preparing to complete testing"
	stop_children
	fs_sync
	result_check
	# In test mode no device is set; the explicit guard keeps the quoted
	# (empty) pattern from matching every df line.
	if [ "$g_netfs" -eq 0 ]; then
		[ -n "$g_dev" ] && df | grep -F -- "$g_dev" > /dev/null 2>&1 &&
			silent_exec umount -f "$g_dev"
	else
		[ -n "$g_netdev" ] && df | grep -F -- "$g_netdev" > /dev/null 2>&1 &&
			silent_exec umount -f "$g_netdev"
	fi
	if [ "$g_madvise" -eq 1 ]; then
		echo "$g_vm_dirty_background_ratio" > /proc/sys/vm/dirty_background_ratio
		echo "$g_vm_dirty_ratio" > /proc/sys/vm/dirty_ratio
		echo "$g_vm_dirty_expire_centisecs" > /proc/sys/vm/dirty_expire_centisecs
	fi
	end "preparing to complete testing"
	log "!!! Linux HWPOISON stress testing DONE !!!"
	log "result: $g_result"
	log "log: $g_logfile"
	if [ "$g_failed" -ne 0 ]; then
		exit 1
	else
		exit 0
	fi
}
939
# Resolve conflicting injector options into one consistent selection.
#  - test mode (-T): APEI is switched off; the pfn injector is dropped
#    when madvise is requested, otherwise soft offline is forced.
#  - APEI (-A) wins over both the pfn injector and madvise.
#  - madvise (-M) excludes the pfn injector.
# Globals updated: g_apei, g_pfninj, g_madvise, g_soft_offline
select_injector()
{
	# for test mode, apei injector is not supported.
	if (( g_test == 1 )); then
		(( g_apei == 1 )) && g_apei=0
		if (( g_madvise == 1 )); then
			g_pfninj=0
		else
			g_soft_offline=1
		fi
	fi

	# for non-test mode, apei injector is 1st priority.
	if (( g_apei == 1 )); then
		g_pfninj=0
		g_madvise=0
	fi

	if (( g_madvise == 1 )); then
		g_pfninj=0
	fi
}
962
# ---- defaults of the test parameters (most can be changed by options) ----
g_children=0	# child process num for each workload.
		# 0 means using default child process num of each workload.
g_dev=			# -d: block device under test
g_debugfs=		# debugfs mount point, filled in by check_env
g_netdev=		# -n: network disk under test
g_fstype=ext3		# -f
g_netfs=0		# set to 1 by check_env for nfs/cifs
g_nomkfs=0		# -N
g_force=0		# -F
g_duration=120		# -t: test duration in seconds
g_interval=5		# -i: sleep interval in seconds
g_runltp=0		# -L
g_ltproot="/ltp"	# -o
g_ltppan="$g_ltproot/pan/ltp-pan"
g_pagetool="page-types"

# ---- injector selection ----
g_madvise=0		# -M
g_apei=0		# -A
g_pfninj=1

# ---- working directories and files, all below the current directory ----
g_rootdir=$(pwd)
g_bindir=$g_rootdir/bin
g_casedir=$g_rootdir/runtest
g_logdir=$g_rootdir/log
g_testdir=$g_rootdir/hwpoison
g_resultdir=$g_rootdir/result
g_logfile=$g_resultdir/hwpoison.log	# -l
g_result=$g_resultdir/hwpoison.result	# -r

# ---- run-time state ----
g_failed=0		# number of task groups that reported failures
g_time_s=		# start time of the injection phase (epoch seconds)
g_time_e=		# end time of the injection phase (epoch seconds)
g_tty=$(tty)		# -c: console used for the test output
g_pid_madv=		# pids of the background jobs
g_pid_fsmeta=
g_pid_ltp=
g_progress=
g_percent=0		# last progress percentage that was reported

# ---- page and memory layout ----
g_pgtype="lru,referenced,readahead,swapcache,swapbacked,anonymous"	# -p
g_pgsize=4096	# page size on the system
g_maxpfn=	# maxpfn on the system
g_highmem_s=	# start pfn of highmem
g_highmem_e=	# end pfn of highmem
g_lowmem_s=	# start pfn of mem < 4G
g_lowmem_e=	# end pfn of mem < 4G
g_sysfs_mem="/sys/devices/system/memory"
g_soft_offline=0	# -S
g_test=0		# -T

# recycle poisoned page
g_recycle=0		# -R
g_last=0

# madvise injector specific global variable
# (the current vm settings are saved so that cleanup can restore them)
g_vm_dirty_background_ratio=$(cat /proc/sys/vm/dirty_background_ratio)
g_vm_dirty_ratio=$(cat /proc/sys/vm/dirty_ratio)
g_vm_dirty_expire_centisecs=$(cat /proc/sys/vm/dirty_expire_centisecs)

# test parameters
g_parameter="$*"
1020
# Command line parsing.  The leading ':' of the option string selects
# getopts' silent mode: a missing option argument is reported as ':' and
# an unknown option as '?', both with the offending letter in OPTARG.
while getopts ":c:d:f:hi:l:n:o:p:r:s:t:C:LMR:STAFNV" option
do
	case $option in
		c) g_tty=$OPTARG;;
		d) g_dev=$OPTARG;;
		f) g_fstype=$OPTARG;;
		l) g_logfile=$OPTARG;;
		t) g_duration=$OPTARG;;
		i) g_interval=$OPTARG;;
		n) g_netdev=$OPTARG;;
		o) g_ltproot=$OPTARG
		   g_ltppan="$g_ltproot/pan/ltp-pan";;	# ltp-pan lives below the ltp root
		p) g_pgtype=$OPTARG;;
		s) g_pgsize=$OPTARG;;
		r) g_result=$OPTARG;;
		C) g_children=$OPTARG;;
		L) g_runltp=1;;
		M) g_madvise=1;;
		R) g_recycle=$OPTARG;;
		S) g_soft_offline=1;;
		T) g_test=1;;
		A) g_apei=1;;
		F) g_force=1;;
		N) g_nomkfs=1;;
		V) DEBUG=1;;
		h) usage;;
		:) invalid "option -$OPTARG requires an argument";;
		*) invalid "invalid option: -$OPTARG";;
	esac
done
1050
# ---- main sequence ----
# 1. settle on one injector and open the logs
select_injector
setup_log
log "!!! Linux HWPOISON stress testing starts NOW !!!"
log "!!! test will run about $g_duration seconds !!!"
# 2. prepare the environment (this also installs the exit handler)
setup_env
# 3. start the background workloads; in madvise mode the page_poisoning
#    test launched by err_inject is the workload
if [ $g_madvise -eq 0 ]; then
	if [ $g_runltp -eq 1 ]; then
		run_ltp
	fi
	run_workloads
fi
# 4. inject errors for the configured duration
err_inject
# 5. verify the filesystem (local block device, non-test mode only)
[ $g_netfs -eq 0 -a $g_test -eq 0 ] && run_fsck
1062