#!/bin/bash
# randomly soft offline pages
#
# random_offline options
#   -t seconds    runtime in seconds (default unlimited)
#   -m max-pages  maximum pages to tie up before unpoisoning
#   -s seed       random seed
#
# Note: running this for too long may still run out of memory
# because unpoison cannot completely undo what soft offline
# does to larger free memory areas (TBD in the kernel)
#
# Writes two scratch files into the current directory:
#   offlined         addresses soft-offlined since the last unpoison pass
#   unpoison-failed  "<address> <status>" for every failed unpoison
#
# Author: Andi Kleen
# Without -s the seed is the current time; the seed is printed so that a
# run can be repeated with -s.

#mount -t debugfs none /debug

THRESH=1000
SEED=""
RUNTIME=""
DEBUG=/sys/kernel/debug

# Print an error message to stderr and terminate with a failure status.
fail() {
  echo "ERROR: $*" >&2
  exit 1
}

# Print the option summary, then fail naming the offending option ($1).
usage() {
  echo "Usage:"
  echo "random_offline options"
  # printf instead of "echo --": bash's echo prints the "--" literally.
  printf '%s\n' \
    "-t seconds runtime in seconds (default unlimited)" \
    "-m max-pages maximum pages to tie up before unpoisoning" \
    "-s seed random seed"
  fail "Invalid option $1"
}

# Succeed only if $1 is a non-empty string of decimal digits.
is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

# Leading ':' selects getopts' silent mode so the offending flag is
# available in OPTARG instead of the option variable being a bare '?'.
while getopts ":t:m:s:" option; do
  case "$option" in
    t)
      is_uint "$OPTARG" || fail "-t expects a non-negative integer, got '$OPTARG'"
      RUNTIME=$((10#$OPTARG)) # 10# keeps leading zeros from meaning octal
      ;;
    m)
      is_uint "$OPTARG" || fail "-m expects a non-negative integer, got '$OPTARG'"
      THRESH=$((10#$OPTARG))
      ;;
    s)
      is_uint "$OPTARG" || fail "-s expects a non-negative integer, got '$OPTARG'"
      SEED=$((10#$OPTARG))
      ;;
    *)
      usage "-$OPTARG"
      ;;
  esac
done

[[ $(whoami) == root ]] || fail "Not root"
[[ -d $DEBUG/hwpoison ]] || mount -t debugfs none "$DEBUG"
[[ -d $DEBUG/hwpoison ]] || fail "No debugfs"
[[ -w /sys/devices/system/memory/soft_offline_page ]] \
  || fail "No soft offlining support in kernel"
[[ -w $DEBUG/hwpoison/unpoison-pfn ]] || fail "no unpoison support in kernel"

# Print the highest "end" address, divided by 4096, among the
# /sys/firmware/memmap entries whose type is "System RAM".
# Prints 0 when no such entry is readable.
end_of_memory() {
  local entry end pages max=0
  for entry in /sys/firmware/memmap/*; do
    [[ -r $entry/type && -r $entry/end ]] || continue
    [[ $(< "$entry/type") == "System RAM" ]] || continue
    end=$(< "$entry/end") # hex with 0x prefix; bash arithmetic parses it
    pages=$((end / 4096))
    if ((pages > max)); then
      max=$pages
    fi
  done
  echo "$max"
}

E=$(end_of_memory)
# E is the modulus for the random page pick below; it must be positive.
if ! is_uint "$E" || ((E == 0)); then
  fail "Cannot determine end of System RAM from /sys/firmware/memmap"
fi
echo "soft offlining pages upto $E"

# Unpoison every address recorded in ./offlined.
# Updates the globals utotal and usuccess; each failure is appended to
# ./unpoison-failed together with the status of the failed write.
# Does nothing when ./offlined does not exist.
unpoison() {
  local addr rc
  [[ -f offlined ]] || return 0
  echo unpoisioning
  while read -r addr; do
    ((utotal++))
    # Recorded addresses are multiples of 0x1000; dropping the trailing
    # "000" turns the address into the value written to unpoison-pfn.
    if printf '%s\n' "${addr%000}" > "$DEBUG/hwpoison/unpoison-pfn"; then
      ((usuccess++))
    else
      rc=$? # capture now: any later command overwrites $?
      echo "$addr $rc" >> unpoison-failed
      echo "unpoisioning $addr failed: $rc"
    fi
  done < offlined
  echo done
  echo
}

# Give pages back even if the script is interrupted.
trap unpoison EXIT

if [[ -z $SEED ]]; then
  SEED=$(date +%s)
fi
RANDOM=$SEED
echo "Using random seed $SEED"

start=$(date +%s)
failed=0
ufailed=0
success=0
usuccess=0
total=0
utotal=0
cbefore=$(grep HardwareCorrupted /proc/meminfo)
k=0

rm -f offlined unpoison-failed

while true; do
  # RANDOM must be read in this shell: draws made inside $(...) happen in
  # a subshell and never advance this shell's seeded generator.
  # A single draw is only 15 bits, so three are combined to be able to
  # reach page numbers beyond 32767.
  ((r = (RANDOM << 30) | (RANDOM << 15) | RANDOM))
  printf -v T '0x%X' "$(((r % E) * 4096))"

  ((total++))
  if printf '%s\n' "$T" 2> /dev/null > /sys/devices/system/memory/soft_offline_page; then
    echo "$T" >> offlined
    ((success++))
  else
    ((failed++))
  fi

  ((k++))
  if ((k > THRESH)); then
    unpoison
    k=0
    rm -f offlined # may not exist if no page was offlined in this batch
  fi

  if [[ -n $RUNTIME ]]; then
    ((DIFF = $(date +%s) - start))
    if ((DIFF > RUNTIME)); then
      echo time over
      # Release the last partial batch here so that it is counted in the
      # summary below, then drop the EXIT trap to avoid a second pass.
      unpoison
      rm -f offlined
      trap - EXIT
      break
    fi
  fi
done

if [[ -f unpoison-failed ]]; then
  ufailed=$(($(wc -l < unpoison-failed)))
fi

echo "soft-poison: success $success failed $failed of total $total"
echo "unpoison-failed: success $usuccess failed $ufailed of total $utotal"
echo "poisoned before: $cbefore"
echo -n "poisoned after: "
grep HardwareCorrupted /proc/meminfo

### xxx automatic success/failure criteria?