#!/bin/bash
# randomly soft offline pages
# random_offline options
# -t seconds    runtime in seconds (default unlimited)
# -m max-pages  maximum pages to tie up before unpoisoning
# -s seed       random seed (default: current time in seconds)
# Note: running this for too long may still run out of memory
# because unpoison cannot completely undo what soft offline
# does to larger free memory areas (TBD in the kernel)
# Author: Andi Kleen
#
# Working files "offlined" and "unpoison-failed" are created in the
# current directory.

#mount -t debugfs none /debug

THRESH=1000
SEED=""
RUNTIME=""
DEBUG=/sys/kernel/debug

# Print an error message to stderr and abort with a non-zero status.
# Arguments: the message words.
fail() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the option summary and abort.
# Arguments: $1 - the option character reported by getopts.
usage() {
  {
    echo "Usage:"
    echo "random_offline options"
    # printf instead of "echo --": bash's echo prints "--" literally.
    printf '%s\n' "-t seconds    runtime in seconds (default unlimited)"
    printf '%s\n' "-m max-pages  maximum pages to tie up before unpoisoning"
    printf '%s\n' "-s seed       random seed"
  } >&2
  fail "Invalid option $1"
}

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

while getopts "t:m:s:" option; do
  case "$option" in
    t) RUNTIME=$OPTARG ;;
    m) THRESH=$OPTARG ;;
    s) SEED=$OPTARG ;;
    *) usage "$option" ;;
  esac
done

# Reject values that would later break the numeric comparisons.
if [ -n "$RUNTIME" ] && ! is_uint "$RUNTIME"; then
  fail "-t expects a non-negative integer, got '$RUNTIME'"
fi
is_uint "$THRESH" || fail "-m expects a non-negative integer, got '$THRESH'"

[ "$(id -u)" -eq 0 ] || fail "Not root"
[ -d "$DEBUG/hwpoison" ] || mount -t debugfs none "$DEBUG"
[ -d "$DEBUG/hwpoison" ] || fail "No debugfs"
[ -w /sys/devices/system/memory/soft_offline_page ] ||
  fail "No soft offlining support in kernel"
[ -w "$DEBUG/hwpoison/unpoison-pfn" ] || fail "no unpoison support in kernel"

# Print the highest 4 KiB page number found among the "System RAM"
# entries of /sys/firmware/memmap (decimal). Prints nothing when no such
# entry is readable.
end_of_memory() {
  local entry end
  for entry in /sys/firmware/memmap/*; do
    # Skips the literal pattern when the glob matched nothing.
    [ -r "$entry/type" ] || continue
    [ "$(< "$entry/type")" = "System RAM" ] || continue

    # "end" is a 0x-prefixed hex byte address; bash arithmetic parses it.
    end=$(< "$entry/end")
    echo $(( end / 4096 ))
  done | sort -n | tail -n1
}

E=$(end_of_memory)
if ! is_uint "$E" || [ "$E" -eq 0 ]; then
  fail "Cannot determine end of memory from /sys/firmware/memmap"
fi

echo "soft offlining pages upto $E"

# Unpoison every address recorded in ./offlined.
# Globals: utotal, usuccess (incremented); DEBUG (read).
# Failures are appended to ./unpoison-failed as "<address> <status>".
unpoison() {
  local addr rc

  if [ ! -f offlined ]; then
    return
  fi

  echo unpoisioning
  while read -r addr; do
    (( utotal++ ))
    # Addresses are page aligned hex (0x...000); dropping the trailing
    # three hex digits yields the page frame number.
    if echo "${addr%000}" > "$DEBUG/hwpoison/unpoison-pfn"; then
      (( usuccess++ ))
    else
      # Capture the status once; each later command overwrites $?.
      rc=$?
      echo "$addr $rc" >> unpoison-failed
      echo "unpoisioning $addr failed: $rc"
    fi
  done < offlined
  echo done
  echo
}

# Give back outstanding pages if the script exits or is interrupted.
trap unpoison EXIT

if [ "$SEED" = "" ]; then
  SEED=$(date +%s)
fi
RANDOM=$SEED
echo "Using random seed $SEED"

start=$(date +%s)
failed=0
ufailed=0
success=0
usuccess=0
total=0
utotal=0

cbefore=$(grep HardwareCorrupted /proc/meminfo)

k=0
rm -f offlined unpoison-failed
while true; do
  # RANDOM must be read in this shell, not inside $( ... ): a command
  # substitution runs in a subshell, so the seeded generator would never
  # advance here and -s could not reproduce a run. Two 15-bit draws are
  # combined so that pages beyond the first 32768 can be selected too.
  r=$(( (RANDOM << 15) | RANDOM ))
  printf -v T '0x%X' "$(( (r % E) * 4096 ))"

  (( total++ ))
  # Many pages cannot be soft offlined; that is expected, so the write
  # error is silenced and only counted.
  if echo 2>/dev/null "$T" > /sys/devices/system/memory/soft_offline_page; then
    echo "$T" >> offlined
    (( success++ ))
  else
    (( failed++ ))
  fi

  (( k++ ))
  if [ "$k" -gt "$THRESH" ]; then
    unpoison
    k=0
    # -f: the file does not exist when no page of this batch succeeded.
    rm -f offlined
  fi

  if [ -n "$RUNTIME" ]; then
    DIFF=$(( $(date +%s) - start ))
    if [ "$DIFF" -gt "$RUNTIME" ]; then
      echo time over
      # NOTE(review): clearing the trap means pages still listed in
      # ./offlined are not unpoisoned on time-out - confirm this is intended.
      trap - EXIT
      break
    fi
  fi
done

if [ -f unpoison-failed ]; then
  ufailed=$(wc -l < unpoison-failed)
fi
echo "soft-poison: success $success failed $failed of total $total"
echo "unpoison-failed: success $usuccess failed $ufailed of total $utotal"
echo "poisoned before: $cbefore"
echo -n "poisoned after: "
grep HardwareCorrupted /proc/meminfo

### xxx automatic success/failure criteria?