| #! /bin/bash |
| # |
| # Stress test driver for Linux MCA High Level Handlers |
| # |
| # This program is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU General Public |
| # License as published by the Free Software Foundation; version |
| # 2. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # General Public License for more details. |
| # |
| # You should find a copy of v2 of the GNU General Public License somewhere |
| # on your Linux system; if not, write to the Free Software Foundation, |
| # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| # |
| # Copyright (C) 2009, Intel Corp. |
| # Author: Haicheng Li <haicheng.li@intel.com> |
| # |
| |
#set -x
# Locate the test-suite root: the parent of the directory holding this script.
# Everything is quoted so a checkout path containing whitespace still works.
sd=$(dirname "$0")
ROOT=$(cd "$sd/.." && pwd)
export ROOT

# Shared helper library. check_env calls check_debugfs, which is presumably
# defined there (it is not defined in this file).
. "$ROOT/lib/mce.sh"

# Verbosity switch: 0 silences helper commands and debug output, 1 (-V) shows them.
DEBUG=0
| |
# Run a command in the foreground, discarding its stdout/stderr unless
# DEBUG is non-zero.
# Arguments: the command and its arguments, passed through verbatim.
# Returns:   the exit status of the command.
silent_exec()
{
    # "$@" keeps every argument intact; flattening into a string and
    # re-splitting it would break arguments containing whitespace or globs.
    if [ "$DEBUG" -eq 0 ]; then
        "$@" > /dev/null 2>&1
    else
        "$@"
    fi
}
| |
# Start a command in the background, discarding its stdout/stderr unless
# DEBUG is non-zero. The job is started in the caller's shell, so the
# caller can read its pid from $! right after this function returns.
# Arguments: the command and its arguments, passed through verbatim.
# Returns:   0 (launching a background job does not report its status).
silent_exec_background()
{
    if [ "$DEBUG" -eq 0 ]; then
        "$@" > /dev/null 2>&1 &
    else
        "$@" &
    fi
    return 0
}
| |
# Echo the arguments to the console named by $g_tty.
# Leading echo options (-n, -e, -en) are passed through to echo unchanged.
_print()
{
    # Quoted so that text such as "[start]" or "[info]" is never treated as
    # a glob pattern and inner whitespace is preserved.
    echo "$@" > "$g_tty"
}
| |
# Debug print: when DEBUG is 1, show "[debug] <message>" in yellow on the
# console and append the same line to $g_logfile. Does nothing otherwise.
dbp()
{
    [ "$DEBUG" -ne 1 ] && return
    _print -en "\\033[0;33m" # set font color as yellow
    _print "[debug] $*"
    echo "[debug] $*" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
}
| |
# Informational message: show "[info] <message>" in yellow on the console
# and append the same line to $g_logfile.
log()
{
    _print -en "\\033[0;33m" # set font color as yellow
    _print "[info] $*"
    echo "[info] $*" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
}
| |
# Announce the start of a step: print "<message>" followed by a green
# "[start]" tag on the console and append "<message> \t [start]" to
# $g_logfile.
begin()
{
    _print -n "$*"
    _print -en "\\033[0;32m" # set font color as green
    _print -e "\t [start]"
    echo -e "$* \t [start]" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
}
| |
# Announce the end of a step: print "<message>" followed by a green
# "[done]" tag on the console and append "<message> \t [done]" to
# $g_logfile.
end()
{
    _print -n "$*"
    _print -en "\\033[0;32m" # set font color as green
    _print -e "\t [done]"
    echo -e "$* \t [done]" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
}
| |
# Fatal error: report the message in red on the console, record it in both
# $g_result and $g_logfile, then terminate the script with status 1.
err()
{
    _print -en "\\033[0;31m" # set font color as red
    echo > "$g_tty"
    echo "Test aborted by unexpected error!" > "$g_tty"
    _print "[error] !!! $* !!!"
    echo > "$g_tty"
    echo "Test aborted by unexpected error!" >> "$g_result"
    echo "[error] !!! $* !!!" >> "$g_result"
    echo "[error] !!! $* !!!" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
    exit 1
}
| |
# Alias for err: abort the test with the given message.
die()
{
    # Forward the arguments untouched so the message is not re-split or
    # glob-expanded on its way to err.
    err "$@"
}
| |
# Fatal usage/environment error: like err, but additionally points the user
# at the help option. Records the message in $g_result and $g_logfile and
# terminates the script with status 1.
invalid()
{
    _print -en "\\033[0;31m" # set font color as red
    echo > "$g_tty"
    echo "Test aborted by unexpected error!" > "$g_tty"
    _print "[error] !!! $* !!!"
    echo > "$g_tty"
    # The script is hwpoison.sh (see usage); the old hint named "./hwposion".
    echo "Try \`./hwpoison.sh -h\` for more information." > "$g_tty"
    echo > "$g_tty"
    echo "Test aborted by unexpected error!" >> "$g_result"
    echo "[error] !!! $* !!!" >> "$g_result"
    echo "[error] !!! $* !!!" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
    exit 1
}
| |
# Result line: show the message in blue on the console and append it to
# both $g_result and $g_logfile. Backslash escapes in the message (e.g. \t)
# are interpreted.
result()
{
    _print -en "\\033[0;34m" # set font color as blue
    _print -e "$*"
    echo -e "$*" >> "$g_result"
    echo -e "$*" >> "$g_logfile"
    _print -en "\\033[0;39m" # restore font color to normal
}
| |
# Work out the pfn ranges used by the "all" page-type injector.
# Globals read:    g_pgsize
# Globals written: g_lowmem_s, g_lowmem_e  - pfn range of the System RAM
#                                            region starting at 1MB
#                  g_highmem_s, g_highmem_e - pfn range of the System RAM
#                                            region starting at 4GB
#                                            (g_highmem_e is left untouched
#                                            when no such region is listed)
#                  g_maxpfn                 - MemTotal expressed in pages
setup_meminfo()
{
    local maxmem=0
    local tmp=

    [ "$g_pgsize" -gt 0 ] 2> /dev/null || err "invalid page size: $g_pgsize"

    # start pfn of mem < 4G
    g_lowmem_s=$(( 0x100000 / g_pgsize ))
    tmp=$(awk '/System RAM/ && $1 ~ /^0*100000-/ { split($1, r, "-"); print r[2]; exit }' /proc/iomem 2> /dev/null)
    if [ -n "$tmp" ]; then
        g_lowmem_e=$(( 0x$tmp / g_pgsize ))
    else
        # No matching region found (or /proc/iomem unreadable).
        g_lowmem_e=0
    fi
    log "low mem: 0x100000 (pfn: $g_lowmem_s) ~ 0x$tmp (pfn: $g_lowmem_e)"

    # start pfn of highmem > 4G
    g_highmem_s=$(( 0x100000000 / g_pgsize ))
    tmp=$(awk '/System RAM/ && $1 ~ /^0*100000000-/ { split($1, r, "-"); print r[2]; exit }' /proc/iomem 2> /dev/null)
    if [ -n "$tmp" ]; then
        g_highmem_e=$(( 0x$tmp / g_pgsize ))
        log "high mem: 0x100000000 (pfn: $g_highmem_s) ~ 0x$tmp (pfn: $g_highmem_e)"
    fi

    # MemTotal is reported in kB; convert with the configured page size
    # instead of assuming 4kB pages.
    maxmem=$(awk '/^MemTotal:/ { print $2; exit }' /proc/meminfo)
    g_maxpfn=$(( ${maxmem:-0} * 1024 / g_pgsize ))
    log "max pfn number: g_maxpfn = $g_maxpfn"
}
| |
# Configure the hwpoison debugfs filter for the selected injector.
#   soft offline: disable the corrupt filter (if present).
#   madvise:      disable the corrupt filter and raise the vm dirty
#                 thresholds so writeback does not change page state
#                 during the test.
#   otherwise:    restrict injection to pages of $g_dev by writing its
#                 major/minor numbers, then enable the filter.
# Aborts through invalid when the device numbers of $g_dev cannot be read.
setup_errinj()
{
    local dev_major=
    local dev_minor=
    local hwp="$g_debugfs/hwpoison"

    if [ "$g_soft_offline" -eq 1 ]; then
        [ -f "$hwp/corrupt-filter-enable" ] && echo 0 > "$hwp/corrupt-filter-enable"
        return
    fi
    if [ "$g_madvise" -eq 1 ]; then
        [ -f "$hwp/corrupt-filter-enable" ] && echo 0 > "$hwp/corrupt-filter-enable"
        # to avoid unexpected page-state changing in background while testing.
        echo 70 > /proc/sys/vm/dirty_background_ratio
        echo 70 > /proc/sys/vm/dirty_ratio
        echo 1000000 > /proc/sys/vm/dirty_expire_centisecs
        return
    fi
    # The redirection has to sit on stat itself to silence its diagnostics.
    if ! dev_major=$(stat --format=%t "$g_dev" 2> /dev/null) ||
       ! dev_minor=$(stat --format=%T "$g_dev" 2> /dev/null); then
        invalid "invalid device: no inode # can be found"
    fi
    echo "0x$dev_major" > "$hwp/corrupt-filter-dev-major"
    echo "0x$dev_minor" > "$hwp/corrupt-filter-dev-minor"
    if [ "$g_pgtype" = "all" ] && [ -f "$hwp/corrupt-filter-flags-mask" ]; then
        echo 0 > "$hwp/corrupt-filter-flags-mask"
    fi
    [ -f "$hwp/corrupt-filter-enable" ] && echo 1 > "$hwp/corrupt-filter-enable"
    return
}
| |
# Create (unless -N or a network fs) and mount the filesystem under test.
# Globals read: g_fstype, g_testdir, g_nomkfs, g_netfs, g_force, g_dev,
#               g_netdev
# Asks for confirmation before formatting unless g_force is set; any
# failure aborts the test through err.
setup_fs()
{
    local mkfs="mkfs.$g_fstype"
    local mkfs_opts="-q"
    local mount_opts=
    local in=

    [ "$g_fstype" = reiserfs ] && mkfs="mkreiserfs"
    [ "$g_fstype" = ocfs2 ] && mkfs_opts="$mkfs_opts -M local"
    [ "$g_fstype" = cifs ] && mount_opts="-o password="
    mkdir -p "$g_testdir" || err "cannot mkdir $g_testdir"
    if [ "$g_nomkfs" -eq 0 ] && [ "$g_netfs" -eq 0 ]; then
        silent_exec which "$mkfs" || err "mkfs: unsupported fstype: $g_fstype"
        if [ "$g_force" -eq 0 ] && [ "$g_fstype" != "ocfs2" ]; then
            echo -n "test will format $g_dev to $g_fstype, continue [y/n]? "
            read -r in
            # An empty or unexpected answer cancels the format.
            case "$in" in
            y|yes|Y) ;;
            *) err "$mkfs on $g_dev is cancelled" ;;
            esac
        fi
        begin "-- $mkfs $g_dev"
        case "$g_fstype" in
        vfat|msdos|btrfs) mkfs_opts="" ;;
        xfs) mkfs_opts="-f" ;;
        esac
        [ "$g_fstype" = ocfs2 ] && echo -n "test will format $g_dev to $g_fstype, continue [y/n]? "
        # $mkfs_opts is deliberately unquoted: it holds zero or more options.
        silent_exec "$mkfs" $mkfs_opts "$g_dev" || err "cannot $mkfs $mkfs_opts on $g_dev"
        end "-- $mkfs $g_dev"
    fi
    if [ "$g_netfs" -eq 0 ]; then
        silent_exec mount -t "$g_fstype" "$g_dev" "$g_testdir" || err "cannot mount $g_fstype fs: $g_dev to $g_testdir"
    else
        # $mount_opts is deliberately unquoted: it is empty or "-o <opts>".
        silent_exec mount -t "$g_fstype" $mount_opts "$g_netdev" "$g_testdir" || err "cannot mount $g_fstype $mount_opts fs: $g_netdev to $g_testdir"
    fi
}
| |
# Validate the environment before anything destructive happens. Every
# failed check aborts the script through invalid.
# Globals written: g_debugfs (debugfs mount point), g_netfs (set to 1 for
#                  nfs/cifs), g_pagetool (resolved to its full path)
check_env()
{
    check_debugfs
    # Use the first debugfs mount if it happens to be mounted more than once.
    g_debugfs=$(mount | grep debugfs | cut -d ' ' -f3 | head -n 1)
    [ -z "$g_tty" ] && invalid "no tty console is specified"
    if [ "$g_test" -eq 0 ]; then
        if [ "$g_fstype" = "nfs" ] || [ "$g_fstype" = "cifs" ]; then
            g_netfs=1
            [ -z "$g_netdev" ] && invalid "net device is not specified"
        fi
        [ -z "$g_dev" ] && invalid "device is not specified"
        [ -b "$g_dev" ] || invalid "invalid device: $g_dev"
        if [ "$g_netfs" -eq 0 ]; then
            df | grep -qF -- "$g_dev" && invalid "device $g_dev has been mounted by others"
        else
            df | grep -qF -- "$g_netdev" && invalid "device $g_netdev has been mounted by others"
        fi
    fi
    [ -d "$g_bindir" ] || invalid "no bin subdir there"
    if [ "$g_madvise" -eq 0 ] || [ "$g_recycle" -ne 0 ]; then
        silent_exec which "$g_pagetool" || invalid "no $g_pagetool tool on the system"
        g_pagetool=$(which "$g_pagetool")
        dbp "Found the tool: $g_pagetool"
    fi
    if [ "$g_pfninj" -eq 1 ]; then
        if [ "$g_soft_offline" -eq 1 ]; then
            [ -f "$g_sysfs_mem/soft_offline_page" ] || invalid "pls. ensure soft_offline_page is enabled"
        elif modinfo hwpoison_inject > /dev/null 2>&1; then
            #if hwpoison_inject is a module, it is ensured to have been loaded
            if [ ! -d "$g_debugfs/hwpoison/" ]; then
                modprobe hwpoison_inject || invalid "module hwpoison_inject isn't supported ?"
            fi
        fi
    fi
    if [ "$g_recycle" -ne 0 ]; then
        [ -f "$g_debugfs/hwpoison/unpoison-pfn" ] || invalid "pls. insmod hwpoison_inject module with unpoison-pfn support"
    fi
    if [ "$g_apei" -eq 1 ] && modinfo einj > /dev/null 2>&1; then
        #if einj is a module, it is ensured to have been loaded
        if [ ! -d "$g_debugfs/apei/einj" ]; then
            modprobe einj || invalid "module apei_inj isn't supported ?"
        fi
    fi
    if [ ! -d "$g_ltproot" ] || [ ! -f "$g_ltppan" ]; then
        invalid "no ltp-pan on the machine: $g_ltppan"
    fi
    if [ "$g_runltp" -eq 1 ]; then
        if [ ! -d "$g_ltproot" ] || [ ! -f "$g_ltproot/runltp" ]; then
            invalid "no runltp on the machine"
        fi
    fi
    if [ "$g_duration" -eq 0 ]; then
        invalid "test duration is set as 0 second"
    fi
    return 0
}
| |
# Prepare the result and log locations: create $g_resultdir, recreate
# $g_logdir from scratch, and start $g_logfile, $g_result and the console
# with a header line showing the command-line parameters. The console is
# cleared first unless running in test mode.
setup_log()
{
    mkdir -p "$g_resultdir"
    rm -rf -- "$g_logdir"
    mkdir -p "$g_logdir"
    echo "# hwpoison.sh $g_parameter" > "$g_logfile"
    echo "# hwpoison.sh $g_parameter" > "$g_result"
    [ "$g_test" -eq 0 ] && clear > "$g_tty"
    echo "# hwpoison.sh $g_parameter" > "$g_tty"
}
| |
# Bring up the whole test environment: validate it, configure the error
# injector, read the memory layout, install cleanup as the EXIT handler,
# mount the filesystem under test (skipped in test mode) and append
# $g_bindir to PATH.
setup_env()
{
    begin "setup test environment"
    mkdir -p "$g_casedir"
    check_env
    setup_errinj
    setup_meminfo
    # From here on any exit path runs cleanup.
    trap "cleanup" 0
    if [ "$g_test" -eq 0 ]; then
        setup_fs
    fi
    export PATH="${PATH}:$g_bindir"
    end "setup test environment"
}
| |
# Launch the LTP suite ($g_ltproot/runltp) in the background for
# $g_duration seconds, using $g_testdir/ltp_tmp as its scratch directory
# and $g_logdir/ltp for its log, output and failed-case files.
# Globals written: g_pid_ltp (pid of the background job)
run_ltp()
{
    local ltp_dir="$g_logdir/ltp"
    local ltp_failed="$ltp_dir/ltp_failed"
    local ltp_log="$ltp_dir/ltp_log"
    local ltp_output="$ltp_dir/ltp_output"
    local ltp_tmp="$g_testdir/ltp_tmp"

    begin "launch ltp workload in background"
    mkdir -p "$ltp_dir"
    # Start from empty bookkeeping files.
    : > "$ltp_failed"
    : > "$ltp_log"
    : > "$ltp_output"
    mkdir -p "$ltp_tmp"
    silent_exec_background "$g_ltproot/runltp" -d "$ltp_tmp" -l "$ltp_log" -o "$ltp_output" -r "$g_ltproot" -t "${g_duration}s" -C "$ltp_failed"
    g_pid_ltp=$!
    end "launch ltp workload in background (pid: $g_pid_ltp)"
}
| |
# Summarise the LTP run from $g_logdir/ltp/ltp_failed: every line in that
# file counts as one failed case. A missing file or any failed case bumps
# g_failed by one.
ltp_result()
{
    local num=0
    local ltp_failed="$g_logdir/ltp/ltp_failed"
    local ltp_output="$g_logdir/ltp/ltp_output"

    if [ ! -f "$ltp_failed" ]; then
        result "\tltp -- error: no ltp result there"
        result "\t log: $ltp_output"
        g_failed=$(( g_failed + 1 ))
        return
    fi
    num=$(( $(wc -l < "$ltp_failed") ))
    if [ "$num" -ne 0 ]; then
        result "\tltp -- $num case(s) failed"
        result "\t log: $ltp_output"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tltp -- all tests pass"
    fi
}
| |
| |
# Launch the fs_metadata workload through ltp-pan in the background.
# The worker count is $g_children, or g_duration / 720 when that is 0,
# clamped to the range 1..10 (a clamp at 10 also raises node_number to 6).
# Globals written: g_pid_fsmeta (pid of the background ltp-pan job)
# Aborts through err if the work directory cannot be created or the
# workload log reports "abort" after $g_interval seconds.
fs_metadata()
{
    local dir="$g_logdir/fs_metadata"
    local result="$dir/fs_metadata.result"
    local log="$dir/fs_metadata.log"
    local pan_log="$dir/pan_log"
    local pan_output="$dir/pan_output"
    local pan_zoo="$dir/pan_zoo"
    local pan_failed="$dir/pan_failed"
    local tmp="$g_testdir/fs_metadata"
    local threads=
    local node_number=5
    local tree_depth=6

    if [ "$g_children" -eq 0 ]; then
        threads=$(( g_duration / 720 ))
    else
        threads=$g_children
    fi
    if [ "$threads" -gt 10 ]; then
        threads=10
        node_number=6
    fi
    [ "$threads" -eq 0 ] && threads=1

    begin "launch fs_metadata workload"
    mkdir -p "$dir"
    : > "$pan_failed"
    : > "$pan_log"
    : > "$pan_output"
    : > "$pan_zoo"
    log "setup fs_metadata test environment"
    # The removal must finish before the directory is recreated; running
    # it in the background could delete the fresh directory.
    silent_exec rm -rf "$tmp"
    mkdir -p "$tmp" || err "cannot create dir: $tmp"

    echo "fs_metadata fs-metadata.sh $tree_depth $node_number $threads $g_duration $result $tmp $log" > "$g_casedir/fs_metadata"
    dbp "$g_ltppan -n fs_metadata -a $pan_zoo -f $g_casedir/fs_metadata -o $pan_output -l $pan_log -C $pan_failed &"
    silent_exec_background "$g_ltppan" -n fs_metadata -a "$pan_zoo" -f "$g_casedir/fs_metadata" -o "$pan_output" -l "$pan_log" -C "$pan_failed"
    g_pid_fsmeta=$!
    sleep "$g_interval"
    silent_exec grep "abort" "$log" && err "failed to launch fs_metadata workload, it might be due to insufficient disk space, pls read $log for details!"
    end "launch fs_metadata workload (pid: $g_pid_fsmeta)"
}
| |
# Summarise the fs_metadata workload from its result file. The numbers
# are taken from the last ':'-separated field of the lines containing
# FAIL and PASS; a missing line counts as 0.
# g_failed is bumped when the result file is missing, when any test
# failed, or when no test passed at all.
fs_metadata_result()
{
    local fail_num=0
    local pass_num=0
    local dir="$g_logdir/fs_metadata"
    local result_file="$dir/fs_metadata.result"
    local log="$dir/fs_metadata.log"

    if [ ! -f "$result_file" ]; then
        result "\tfs_metadata -- error: no result there"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
        return
    fi
    # Each counter defaults to 0 on its own, so a missing PASS line can
    # no longer be reported as "all tests got pass".
    fail_num=$(awk -F : '/FAIL/ { n += $NF } END { print n + 0 }' "$result_file")
    pass_num=$(awk -F : '/PASS/ { n += $NF } END { print n + 0 }' "$result_file")
    if [ "$fail_num" -ne 0 ]; then
        result "\tfs_metadata -- $fail_num tests failed, $pass_num tests pass."
        result "\t details: $result_file"
        g_failed=$(( g_failed + 1 ))
    elif [ "$pass_num" -eq 0 ]; then
        result "\tfs_metadata -- no test finished"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tfs_metadata -- all $pass_num tests got pass"
    fi

    return
}
| |
# fs_specific workload, TBD
# Placeholder for a filesystem-specific workload: for now it only creates
# the marker file $g_logdir/fs_specific between the begin/end banners.
fs_specific()
{
    begin "launch $g_fstype specific workload"

    touch "$g_logdir/fs_specific"
    # $g_ltppan -n fs_specific -a $g_logdir/fs_specific -f $g_casedir/fs_specific -t ${g_duration}s &
    end "launch $g_fstype specific workload"
}
| |
# Launch the page_poisoning (madvise-based) test through ltp-pan in the
# background for $g_duration seconds. When $g_children is non-zero it is
# passed to the test as "-i <children>".
# Globals written: g_pid_madv (pid of the background ltp-pan job)
page_poisoning()
{
    local dir="$g_logdir/page_poisoning"
    local pan_failed="$dir/pan_failed"
    local pan_log="$dir/pan_log"
    local pan_output="$dir/pan_output"
    local tmp="$g_testdir/page_poisoning"
    local pan_zoo="$dir/pan_zoo"
    local result="$dir/page_poisoning.result"
    local log="$dir/page_poisoning.log"
    local opts=

    begin "-- launch page_poisoning test"
    mkdir -p "$dir"
    # Start from empty bookkeeping files.
    : > "$pan_failed"
    : > "$pan_log"
    : > "$pan_output"
    : > "$pan_zoo"
    : > "$log"
    : > "$result"
    mkdir -p "$tmp" || err "cannot create dir: $tmp"

    [ "$g_children" -ne 0 ] && opts="-i $g_children"

    echo "page_poisoning page-poisoning -l $log -r $result -t $tmp $opts" > "$g_casedir/page_poisoning"
    dbp "$g_ltppan -n page_poisoning -a $pan_zoo -f $g_casedir/page_poisoning -t ${g_duration}s -o $pan_output -l $pan_log -C $pan_failed &"
    silent_exec_background "$g_ltppan" -n page_poisoning -a "$pan_zoo" -f "$g_casedir/page_poisoning" -t "${g_duration}s" -o "$pan_output" -l "$pan_log" -C "$pan_failed"
    g_pid_madv=$!
    end "-- launch page_poisoning test (pid: $g_pid_madv)"
}
| |
# Summarise the page_poisoning test: count the lines containing FAILED
# and PASS in its result file. g_failed is bumped when the file is
# missing, when any line reports FAILED, or when nothing passed.
page_poisoning_result()
{
    local fail_num=0
    local pass_num=0
    local dir="$g_logdir/page_poisoning"
    local result_file="$dir/page_poisoning.result"
    local log="$dir/page_poisoning.log"

    if [ ! -f "$result_file" ]; then
        result "\tpage_poisoning -- error: no result file there"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
        return
    fi
    fail_num=$(grep -c FAILED "$result_file")
    pass_num=$(grep -c PASS "$result_file")
    if [ "$fail_num" -ne 0 ]; then
        result "\tpage_poisoning -- $fail_num tests failed, $pass_num tests pass."
        result "\t details: $result_file"
        g_failed=$(( g_failed + 1 ))
    elif [ "$pass_num" -eq 0 ]; then
        result "\tpage_poisoning -- no case finished"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tpage_poisoning -- all $pass_num tests got pass"
    fi

    return
}
| |
# Start the background workloads that exercise the filesystem under test.
# Only the metadata workload is wired up; fs_specific is still a stub and
# stays disabled. The status of fs_metadata is passed back to the caller.
run_workloads()
{
    fs_metadata
    #fs_specific
}
| |
# Unpoison one page by writing its pfn to the hwpoison debugfs file
# unpoison-pfn.
# Arguments: $1 - page frame number, as passed by the caller
_pfn_unpoison()
{
    local pg=$1

    echo "$pg" > "$g_debugfs/hwpoison/unpoison-pfn"
    dbp "echo $pg > $g_debugfs/hwpoison/unpoison-pfn"
}
| |
# Unpoison every page that $g_pagetool currently lists with the hwpoison
# flag. The first tab-separated column of each listed line (header lines
# containing "offset" excluded) is taken as a hex pfn.
pfn_unpoison()
{
    local pg_list=
    local pg=

    pg_list=$("$g_pagetool" -NLrb hwpoison | grep -v offset | cut -f1)
    # Deliberately unquoted: one pfn per whitespace-separated word.
    for pg in $pg_list
    do
        _pfn_unpoison "0x$pg" > /dev/null 2>&1
    done
}
| |
# Report injection progress and, when recycling is enabled, unpoison the
# poisoned pages every $g_recycle seconds.
# Globals read:    g_time_s, g_time_e, g_duration, g_recycle,
#                  g_soft_offline
# Globals written: g_percent - last percentage that was logged
#                  g_last    - timestamp of the last unpoison pass
#                              (0 = none yet, counted from g_time_s)
# Nothing is done once the end time is reached or while progress is
# still 0%. A new progress line is logged for the first non-zero
# percentage and then whenever at least 10 more percent have elapsed.
show_progress()
{
    local cur=
    local rest=0
    local percent=0
    local next=0
    local since=0
    local msg="hwpoison page error injection"

    [ "$g_soft_offline" -eq 1 ] && msg="page soft offline"

    cur=$(date +%s)
    [ "$cur" -ge "$g_time_e" ] && return
    rest=$(( g_time_e - cur ))
    percent=$(( (g_duration - rest) * 100 / g_duration ))
    [ "$percent" -eq 0 ] && return
    if [ "$g_recycle" -ne 0 ]; then
        # Measure real elapsed seconds since the previous unpoison pass.
        since=${g_last:-0}
        [ "$since" -eq 0 ] && since=${g_time_s:-$cur}
        if [ $(( cur - since )) -ge "$g_recycle" ]; then
            g_last=$cur
            pfn_unpoison
        fi
    fi
    [ "$percent" -gt 10 ] && next=$(( percent - 10 ))
    [ "$g_percent" -ne 0 ] && [ "$g_percent" -gt "$next" ] && return
    g_percent=$percent
    log "$msg: $g_percent% pages done"
}
| |
# Inject a hwpoison error into one page by writing its pfn to the
# hwpoison debugfs file corrupt-pfn.
# Arguments: $1 - page frame number, as passed by the caller
_pfn_hwpoison()
{
    local pfn=$1

    echo "$pfn" > "$g_debugfs/hwpoison/corrupt-pfn"
    dbp "echo $pfn > $g_debugfs/hwpoison/corrupt-pfn"
}
| |
# Soft-offline one page: convert the pfn to a physical address
# (pfn * g_pgsize) and write it, in hex, to
# $g_sysfs_mem/soft_offline_page.
# Arguments: $1 - page frame number (decimal or 0x-prefixed hex)
_pfn_soft_offline()
{
    local pfn=$1
    local paddr

    paddr=$(printf "0x%x" $(( pfn * g_pgsize )))
    echo "$paddr" > "$g_sysfs_mem/soft_offline_page"
    dbp "echo $paddr > $g_sysfs_mem/soft_offline_page"
}
| |
# Inject errors pfn by pfn, either through the hwpoison debugfs injector
# or, with g_soft_offline set, through soft offlining.
#   g_pgtype = "all": walk g_lowmem_s..g_lowmem_e, then
#                     g_highmem_s..g_highmem_e (both inclusive), never
#                     going past g_maxpfn.
#   otherwise:        inject into every page $g_pagetool lists for the
#                     given page type.
# Every g_progress injections the end time is checked (the walk stops
# once it is reached) and show_progress is called.
pfn_inj()
{
    local pg_list=
    local pg=0
    local pfn=0
    local cur=
    local i=0
    local inj=_pfn_hwpoison

    [ "$g_soft_offline" -eq 1 ] && inj=_pfn_soft_offline
    if [ "$g_pgtype" = "all" ]; then
        pfn=$g_lowmem_s # start from 1M.
        while [ "$pfn" -lt "$g_maxpfn" ]
        do
            pg=$(printf "%x" "$pfn")
            "$inj" "0x$pg" > /dev/null 2>&1
            pfn=$(( pfn + 1 ))
            if [ "$pfn" -gt "$g_lowmem_e" ]; then
                # Past low memory: stop if there is no RAM above 4G.
                [ -n "$g_highmem_e" ] || break
                # Jump over the hole only once; inside high memory the
                # pfn must keep advancing.
                [ "$pfn" -lt "$g_highmem_s" ] && pfn=$g_highmem_s
                [ "$pfn" -gt "$g_highmem_e" ] && break
            fi
            i=$(( i + 1 ))
            if [ "$i" -eq "$g_progress" ]; then
                cur=$(date +%s)
                [ "$cur" -ge "$g_time_e" ] && break
                show_progress
                i=0
            fi
        done
    else
        silent_exec "$g_pagetool" -Nrb "$g_pgtype" || err "unsupported pagetype, pls. refer to command: $g_pagetool -h"
        pg_list=$("$g_pagetool" -NLrb "$g_pgtype" | grep -v offset | cut -f1)
        # Deliberately unquoted: one pfn per whitespace-separated word.
        for pg in $pg_list
        do
            "$inj" "0x$pg" > /dev/null 2>&1
            i=$(( i + 1 ))
            if [ "$i" -eq "$g_progress" ]; then
                cur=$(date +%s)
                [ "$cur" -ge "$g_time_e" ] && break
                show_progress
                i=0
            fi
        done
    fi
}
| |
# Inject one error through the APEI EINJ debugfs interface.
# Arguments: $1 - page frame number (decimal or 0x-prefixed hex)
#            $2 - EINJ error type value written to error_type
# The physical address is pfn * g_pgsize (4096 when g_pgsize is unset).
# All three files live under $g_debugfs/apei/einj, the directory that
# check_env verifies; the address used to be written under apei/err_inj.
# NOTE(review): the file name error_address is kept from the original
# code - confirm it against the EINJ interface of the target kernel.
_apei_inj()
{
    local pfn=$1
    local type=$2
    local einj="$g_debugfs/apei/einj"
    local paddr

    paddr=$(printf "0x%x" $(( pfn * ${g_pgsize:-4096} )))
    echo "$type" > "$einj/error_type"
    echo "$paddr" > "$einj/error_address"
    echo "1" > "$einj/error_inject"
}
| |
# APEI injection wrapper: inject into the page given as $1 using EINJ
# error type 0x2.
apei_ewb_ucr()
{
    local einj_error_type=0x2

    _apei_inj $1 "$einj_error_type"
}
| |
# APEI injection wrapper: inject into the page given as $1 using EINJ
# error type 0x10.
apei_mem_ucr()
{
    local einj_error_type=0x10

    _apei_inj $1 "$einj_error_type"
}
| |
# Inject an APEI memory error into every page $g_pagetool lists for
# $g_pgtype. Every g_progress injections the end time is checked (the
# walk stops once it is reached) and show_progress is called.
apei_inj()
{
    local pg_list=
    local pg=
    local cur=
    local i=0

    pg_list=$("$g_pagetool" -NLrb "$g_pgtype" | grep -v offset | cut -f1)
    # Deliberately unquoted: one pfn per whitespace-separated word.
    for pg in $pg_list
    do
        # The listed pfns are bare hex digits; prefix them with 0x (as
        # pfn_inj does) so they are not parsed as decimal further down.
        apei_mem_ucr "0x$pg"
        i=$(( i + 1 ))
        if [ "$i" -eq "$g_progress" ]; then
            cur=$(date +%s)
            [ "$cur" -ge "$g_time_e" ] && break
            show_progress
            i=0
        fi
    done

    return
}
| |
# Main injection phase: keep injecting errors (or, in madvise mode, let
# the page_poisoning test run) until $g_duration seconds have passed,
# then wait $g_interval seconds for the workloads to settle.
# Globals written: g_progress (= g_duration * 10, the number of
#                  injections between progress checks), g_time_s,
#                  g_time_e (start/end timestamps)
err_inject()
{
    local cur=
    local msg="hwpoison page error injection"
    local MSG="inject HWPOISON error to pages"
    local title=

    if [ "$g_soft_offline" -eq 1 ]; then
        msg="page soft offline"
        MSG="soft OFFLINE pages"
    fi
    if [ "$g_madvise" -eq 1 ]; then
        title="$MSG thru madvise syscall"
    else
        title="$MSG ($g_pgtype)"
    fi
    begin "$title"
    g_progress=$(( g_duration * 10 ))
    g_time_s=$(date +%s)
    g_time_e=$(( g_time_s + g_duration ))
    cur=$g_time_s
    [ "$g_madvise" -eq 1 ] && page_poisoning
    log "$msg: 0% pages done"
    [ "$g_madvise" -eq 1 ] && show_progress
    while [ "$cur" -lt "$g_time_e" ]
    do
        if [ "$g_madvise" -eq 0 ]; then
            show_progress
            [ "$g_apei" -eq 1 ] && apei_inj
            [ "$g_pfninj" -eq 1 ] && pfn_inj
        else
            # The madvise test runs on its own in the background: poll
            # once per second instead of spinning on date/expr and
            # competing with the workload for CPU.
            sleep 1
            show_progress
        fi
        cur=$(date +%s)
    done
    log "$msg: 100% pages done"
    # wait workloads to be finished.
    sleep "$g_interval"

    end "$title"
}
| |
# Record an fsck failure: overwrite both the fsck result file and the
# fsck log under $g_logdir/fsck with "FAILED: <message>".
fsck_err()
{
    local dir="$g_logdir/fsck"
    local result="$dir/fsck.result"
    local log="$dir/fsck.log"

    echo "FAILED: $*" > "$result"
    echo "FAILED: $*" > "$log"
}
| |
# Record an fsck success: overwrite both the fsck result file and the
# fsck log under $g_logdir/fsck with "PASS: <message>".
fsck_pass()
{
    local dir="$g_logdir/fsck"
    local result="$dir/fsck.result"
    local log="$dir/fsck.log"

    echo "PASS: $*" > "$result"
    echo "PASS: $*" > "$log"
}
| |
# Check the filesystem under test after the injection phase: unmount
# $g_dev, run the matching fsck tool on it, mount it back on $g_testdir
# and record PASS or FAILED through fsck_pass / fsck_err.
run_fsck()
{
    local dir="$g_logdir/fsck"
    local result="$dir/fsck.result"
    local log="$dir/fsck.log"
    local fsck="fsck.$g_fstype"
    local opts=""
    local rc=0

    mkdir -p "$dir"
    : > "$log"
    : > "$result"

    case "$g_fstype" in
    btrfs)
        fsck="btrfsck"
        ;;
    reiserfs)
        fsck="reiserfsck"
        opts="-y"
        ;;
    esac
    begin "launch $fsck on $g_dev to check test result"
    silent_exec which "$fsck" || {
        fsck_err "fsck: unsupported fstype: $g_fstype"
        return
    }
    fs_sync
    silent_exec umount -f "$g_dev" || sleep "$g_interval"
    if df | grep -qF -- "$g_dev"; then
        silent_exec umount "$g_dev" || {
            fsck_err "cannot umount $g_dev to do $fsck"
            return
        }
    fi
    # $opts is deliberately unquoted: it is empty or a single option.
    "$fsck" $opts "$g_dev" || rc=$?
    # Record the failure now; it must not be replaced by a PASS verdict
    # after the remount below.
    [ "$rc" -ne 0 ] && fsck_err "err #$rc while $fsck on $g_dev"
    silent_exec mount -t "$g_fstype" "$g_dev" "$g_testdir" || {
        [ "$rc" -eq 0 ] && fsck_err "cannot mount $g_testdir back after fsck_check"
        return
    }
    [ "$rc" -eq 0 ] && fsck_pass "$fsck got pass on $g_dev"
    end "launch $fsck on $g_dev to check test result"
}
| |
# Summarise the fsck step from $g_logdir/fsck/fsck.result by counting the
# lines containing FAILED and PASS. g_failed is bumped when the file is
# missing, when a failure was recorded, or when fsck never ran.
fsck_result()
{
    local dir="$g_logdir/fsck"
    local result_file="$dir/fsck.result"
    local log="$dir/fsck.log"
    local fail_num=0
    local pass_num=0

    if [ ! -f "$result_file" ]; then
        result "\tfsck.$g_fstype -- no result found"
        result "\t details: $log"
        g_failed=$(( g_failed + 1 ))
        return
    fi

    fail_num=$(grep -c FAILED "$result_file")
    pass_num=$(grep -c PASS "$result_file")
    if [ "$fail_num" -ne 0 ]; then
        result "\tfsck.$g_fstype -- failed"
        result "\t log: $log"
        g_failed=$(( g_failed + 1 ))
    elif [ "$pass_num" -eq 0 ]; then
        result "\tfsck.$g_fstype -- not executed"
        result "\t log: $log"
        g_failed=$(( g_failed + 1 ))
    else
        result "\tfsck.$g_fstype -- fsck on $g_dev got pass"
    fi
}
| |
# Print the final summary: collect the verdict of every task group that
# was part of this run, followed by the total number of failing groups.
#   madvise mode:  page_poisoning only
#   otherwise:     fs_metadata, plus ltp when it was requested
#   fsck:          only for a local filesystem outside test mode
result_check()
{
    local banner="#############################################"

    begin "-- collecting test result"
    result "$banner"
    result "result summary:"
    if [ $g_madvise -eq 1 ]; then
        page_poisoning_result
    else
        fs_metadata_result
        if [ $g_runltp -eq 1 ]; then
            ltp_result
        fi
    fi
    if [ $g_netfs -eq 0 ] && [ $g_test -eq 0 ]; then
        fsck_result
    fi
    result ""
    result "totally $g_failed task-groups report failures"
    result "$banner"
    end "-- collecting test result"
}
| |
# Print the help page (synopsis, option table, notes on the device,
# pagetype and console arguments) to stdout and exit with status 0.
usage()
{
    local idx
    # Alternating "flag" / "description" entries; a flag without an
    # argument carries an extra tab so the columns line up.
    local -a option_table=(
        "-c console"   "target tty console to print test log"
        "-d device"    "target block device to run test on"
        "-f fstype"    "filesystem type to be tested"
        "-i interval"  "sleep interval (default is $g_interval seconds)"
        "-l logfile"   "log file"
        "-n netdev"    "target network disk to run test on"
        "-o ltproot"   "ltp root directory (default is $g_ltproot/)"
        "-p pagetype"  "page type to inject error "
        "-r result"    "result file"
        "-s pagesize"  "page size on the system (default is $g_pgsize bytes)"
        "-t duration"  "test duration time (default is $g_duration seconds)"
        "-A \t"        "use APEI to inject error"
        "-C children"  "process num of workloads"
        "-F \t"        "execute as force mode, no interaction with user"
        "-L \t"        "run ltp in background"
        "-M \t"        "run page_poisoning test thru madvise syscall"
        "-N \t"        "do not mkfs target block device"
        "-R recyle"    "automatically unpoison pages after running recyle seconds"
        "-S \t"        "test soft page offline"
        "-T \t"        "test mode, run test in local dir other than on target device"
        "-V \t"        "verbose mode, show debug info"
        "-h \t"        "print this page"
    )

    printf '%s\n' \
        "Usage: ./hwpoison.sh -d /dev/device [-options] [arguments]" \
        "" \
        "Stress Testing for Linux MCA High Level Handlers: "

    for (( idx = 0; idx < ${#option_table[@]}; idx += 2 )); do
        printf '\t%b\t: %b\n' "${option_table[idx]}" "${option_table[idx + 1]}"
    done

    # %b expands the backslash escapes exactly like echo -e would.
    printf '%b\n' \
        "" \
        "device:" \
        "\tthis is a mandatory argument. typically, it's a disk partition." \
        "\tall temporary files will be created on this device." \
        "\terror injector will just inject errors to the pages associated" \
        "\twith this device (except for the testing thru madvise syscall)." \
        "" \
        "pagetype:" \
        "\tdefault page type:" \
        "\t $g_pgtype" \
        "\tfor more details, pls. try \`page-types -h\`." \
        "\tsee the definition of \"bits-spec\"." \
        "" \
        "console:" \
        "\ttest can print output to the console you specified." \
        "\te.g. '-c /dev/tty1'" \
        ""

    exit 0
}
| |
# Flush pending writes to disk, telling the user first because the sync
# can take a while. Returns the status of sync.
fs_sync()
{
    local notice="now to sync up the disk under testing, might need several minutes ..."

    log "$notice"
    sync
}
| |
# Terminate the background jobs started by this script. For each recorded
# pid (page_poisoning, fs_metadata, ltp - in that order) that ps still
# reports, send SIGTERM and then wait $g_interval seconds.
stop_children()
{
    local child

    begin "-- cleaning up remaining tasks in background"
    for child in "$g_pid_madv" "$g_pid_fsmeta" "$g_pid_ltp"
    do
        # Skip jobs that were never started.
        [ -n "$child" ] || continue
        if silent_exec ps "$child"; then
            kill -15 "$child" > /dev/null 2>&1
            sleep "$g_interval"
        fi
    done
    end "-- cleaning up remaining tasks in background"
}
| |
# EXIT handler: stop the background jobs, sync, print the result summary,
# unmount the device under test if it is still mounted and, in madvise
# mode, restore the vm dirty settings saved at start-up.
# Exits with 1 when any task group reported a failure, otherwise 0.
cleanup()
{
    local target=

    log "!!! EXIT signal received, need to exit testing now. !!!"
    begin "preparing to complete testing"
    stop_children
    fs_sync
    result_check
    if [ "$g_netfs" -eq 0 ]; then
        target=$g_dev
    else
        target=$g_netdev
    fi
    # No device is configured in test mode; skip the mount lookup then.
    if [ -n "$target" ] && df | grep -qF -- "$target"; then
        silent_exec umount -f "$target"
    fi
    if [ "$g_madvise" -eq 1 ]; then
        echo "$g_vm_dirty_background_ratio" > /proc/sys/vm/dirty_background_ratio
        echo "$g_vm_dirty_ratio" > /proc/sys/vm/dirty_ratio
        echo "$g_vm_dirty_expire_centisecs" > /proc/sys/vm/dirty_expire_centisecs
    fi
    end "preparing to complete testing"
    log "!!! Linux HWPOISON stress testing DONE !!!"
    log "result: $g_result"
    log "log: $g_logfile"
    if [ "$g_failed" -ne 0 ]; then
        exit 1
    else
        exit 0
    fi
}
| |
# Resolve the command-line switches into one consistent injector choice.
#   test mode: APEI is switched off; madvise turns the pfn injector off,
#              otherwise soft offline is selected.
#   APEI:      takes priority over both the pfn injector and madvise.
#   madvise:   turns the pfn injector off.
# Globals written: g_apei, g_pfninj, g_madvise, g_soft_offline
select_injector()
{
    # for test mode, apei injector is not supported.
    if (( g_test == 1 )); then
        if (( g_apei == 1 )); then
            g_apei=0
        fi
        if (( g_madvise == 1 )); then
            g_pfninj=0
        else
            g_soft_offline=1
        fi
    fi

    # Both remaining injectors replace the pfn injector ...
    if (( g_apei == 1 || g_madvise == 1 )); then
        g_pfninj=0
    fi
    # ... and for non-test mode, apei injector is 1st priority.
    if (( g_apei == 1 )); then
        g_madvise=0
    fi
    return 0
}
| |
# ---- defaults for everything the command line can override ----

# Child process count for each workload;
# 0 means: use the default count of each workload.
g_children=0

# Devices and filesystem under test.
g_dev=
g_debugfs=
g_netdev=
g_testdir="./hwpoison"
g_fstype=ext3
g_netfs=0
g_nomkfs=0
g_force=0

# Timing, in seconds.
g_duration=120
g_interval=5

# LTP integration.
g_runltp=0
g_ltproot="/ltp"
g_ltppan="$g_ltproot/pan/ltp-pan"

# Injector selection.
g_pagetool="page-types"
g_madvise=0
g_apei=0
g_pfninj=1

# Working directories, all relative to the directory the script is run from.
g_rootdir=$(pwd)
g_bindir=$g_rootdir/bin
g_casedir=$g_rootdir/runtest
g_logdir=$g_rootdir/log
g_resultdir=$g_rootdir/result
g_logfile=$g_resultdir/hwpoison.log
g_result=$g_resultdir/hwpoison.result

# Run-time state.
g_failed=0
g_time_s=
g_time_e=
g_tty=$(tty)
g_pid_madv=
g_pid_fsmeta=
g_pid_ltp=
g_progress=
g_percent=0

# Page selection and memory layout.
g_pgtype="lru,referenced,readahead,swapcache,swapbacked,anonymous"
g_pgsize=4096 # page size on the system
g_maxpfn= # maxpfn on the system
g_highmem_s= # start pfn of highmem
g_highmem_e= # end pfn of highmem
g_lowmem_s= # start pfn of mem < 4G
g_lowmem_e= # end pfn of mem < 4G
g_sysfs_mem="/sys/devices/system/memory"
g_soft_offline=0
g_test=0

# Recycling of poisoned pages: g_recycle is the interval given with -R,
# g_last is bookkeeping used by show_progress.
g_recycle=0
g_last=0

# madvise injector specific: vm settings saved here so cleanup can put
# them back.
g_vm_dirty_background_ratio=$(cat /proc/sys/vm/dirty_background_ratio)
g_vm_dirty_ratio=$(cat /proc/sys/vm/dirty_ratio)
g_vm_dirty_expire_centisecs=$(cat /proc/sys/vm/dirty_expire_centisecs)

# The raw command line, kept for the log headers.
g_parameter=$*
| |
# Parse the command line. The leading ':' in the option string selects
# getopts' silent mode, so a missing argument arrives as ':' and an
# unknown option as '?', each with the offending letter in OPTARG.
while getopts ":c:d:f:hi:l:n:o:p:r:s:t:C:LMR:STAFNV" option
do
    case "$option" in
    c) g_tty=$OPTARG;;
    d) g_dev=$OPTARG;;
    f) g_fstype=$OPTARG;;
    l) g_logfile=$OPTARG;;
    t) g_duration=$OPTARG;;
    i) g_interval=$OPTARG;;
    n) g_netdev=$OPTARG;;
    o) g_ltproot=$OPTARG
       g_ltppan="$g_ltproot/pan/ltp-pan";;
    p) g_pgtype=$OPTARG;;
    s) g_pgsize=$OPTARG;;
    r) g_result=$OPTARG;;
    C) g_children=$OPTARG;;
    L) g_runltp=1;;
    M) g_madvise=1;;
    R) g_recycle=$OPTARG;;
    S) g_soft_offline=1;;
    T) g_test=1;;
    A) g_apei=1;;
    F) g_force=1;;
    N) g_nomkfs=1;;
    V) DEBUG=1;;
    h) usage;;
    :) invalid "option -$OPTARG requires an argument";;
    \?) invalid "invalid option: -$OPTARG";;
    *) invalid "invalid option";;
    esac
done
| |
# ---- main sequence ----
# Settle on an injector, open the logs, prepare the environment, start
# the background workloads (not used in madvise mode), run the injection
# phase and finally fsck the local filesystem under test.
select_injector
setup_log
log "!!! Linux HWPOISON stress testing starts NOW !!!"
log "!!! test will run about $g_duration seconds !!!"
setup_env
if [ $g_madvise -eq 0 ]; then
    if [ $g_runltp -eq 1 ]; then
        run_ltp
    fi
    run_workloads
fi
err_inject
# The exit status of the script is decided by the EXIT handler installed
# in setup_env, not by this last step.
if [ $g_netfs -eq 0 ] && [ $g_test -eq 0 ]; then
    run_fsck
fi