#!/bin/bash

#  Copyright (C) 2018 Oracle.  All Rights Reserved.
#
#  Author: Darrick J. Wong <darrick.wong@oracle.com>
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License
#  as published by the Free Software Foundation; either version 2
#  of the License, or (at your option) any later version.
#
#  This program is distributed in the hope that it would be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write the Free Software Foundation,
#  Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.

# Automatically check an LVM-managed filesystem online.
# We use lvm snapshots to do this, which means that we can only
# check filesystems in VGs that have at least 256MB (or so) of
# free space.

PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

if (( $EUID != 0 )); then
	echo "e2scrub must be run as root"
	exit 1
fi

# Defaults; the configuration file sourced below may override them.
snap_size_mb=256
fstrim=0
reap=0
e2fsck_opts=""
conffile="@root_sysconfdir@/e2scrub.conf"

test -f "${conffile}" && . "${conffile}"

# Print the usage summary to stdout.
print_help() {
	echo "Usage: $0 [OPTIONS] mountpoint | device"
	echo
	echo "mountpoint must be on an LVM-managed block device"
	echo "-n: Show what commands e2scrub would execute."
	echo "-r: Remove e2scrub snapshot and exit, do not check anything."
	echo "-t: Run fstrim if successful."
	echo "-V: Print version information and exit."
}

# Print the program version to stdout.
print_version() {
	echo "e2scrub @E2FSPROGS_VERSION@ (@E2FSPROGS_DATE@)"
}

# Exit the script with the given status.
#   $1 - requested exit status
# Sets the global "ret" as a side effect.
exitcode() {
	ret="$1"

	# If we're being run as a service, the return code must fit the LSB
	# init script action error guidelines, which is to say that we
	# compress all errors to 1 ("generic or unspecified error", LSB 5.0
	# section 22.2) and hope the admin will scan the log for what
	# actually happened.

	# We have to sleep 2 seconds here because journald uses the pid to
	# connect our log messages to the systemd service.  This is critical
	# for capturing all the log messages if the scrub fails, because the
	# fail service uses the service name to gather log messages for the
	# error report.
	if [ -n "${SERVICE_MODE}" ] && [ "${ret}" -ne 0 ]; then
		ret=1
		sleep 2
	fi

	exit "${ret}"
}

while getopts "nrtV" opt; do
	case "${opt}" in
	"n") DBG="echo Would execute: " ;;
	"r") reap=1;;
	"t") fstrim=1;;
	"V") print_version; exitcode 0;;
	*) print_help; exitcode 2;;
	esac
done
shift "$((OPTIND - 1))"

arg="$1"
if [ -z "${arg}" ]; then
	print_help
	exitcode 1
fi

if ! type lsblk > /dev/null 2>&1; then
	echo "e2scrub: can't find lsblk --- is util-linux installed?"
	exitcode 1
fi

if ! type lvcreate > /dev/null 2>&1; then
	echo "e2scrub: can't find lvcreate --- is lvm2 installed?"
	exitcode 1
fi

# close file descriptor 3 (from cron) since it causes lvm to kvetch
exec 3<&-

# Find the device for a given mountpoint
#   $1 - mountpoint path (resolved with realpath before comparing)
# Prints the device name of the first ext2/3/4 filesystem that lsblk
# reports at that mountpoint; prints nothing if there is none.
dev_from_mount() {
	local mountpt="$(realpath "$1")"

	# Each lsblk -P line is a list of KEY="value" pairs that eval turns
	# into the NAME, FSTYPE and MOUNTPOINT variables.  read -r keeps any
	# backslashes in that line intact so the text handed to eval is
	# exactly what lsblk printed.
	lsblk -o NAME,FSTYPE,MOUNTPOINT -p -P -n 2> /dev/null | while IFS= read -r vars; do
		eval "${vars}"
		if [ "${mountpt}" != "${MOUNTPOINT}" ]; then
			continue
		fi
		case "${FSTYPE}" in
		ext[234])
			echo "${NAME}"
			# The loop is one side of a pipeline, so this only
			# ends the loop's subshell (stopping at the first
			# match); it does not return from the function.
			return 0
			;;
		esac
	done
	# Always reached; the caller in this script uses only the output.
	return 1
}

# Check a device argument
#   $1 - block device path
# Prints the path and returns 0 if lsblk reports an ext2/3/4 filesystem
# on it; otherwise prints nothing and returns 1.
dev_from_arg() {
	local dev="$1"
	local fstype="$(lsblk -o FSTYPE -n "${dev}" 2> /dev/null)"

	case "${fstype}" in
	ext[234])
		echo "${dev}"
		return 0
		;;
	esac
	return 1
}

# Print the mountpoint that lsblk reports for a device.
#   $1 - block device path; nothing is printed if it is empty
mnt_from_dev() {
	local dev="$1"

	if [ -n "${dev}" ]; then
		lsblk -o MOUNTPOINT -n "${dev}"
	fi
}

# Construct block device path and mountpoint from argument
if [ -b "${arg}" ]; then
	dev="$(dev_from_arg "${arg}")"
	mnt="$(mnt_from_dev "${dev}")"
else
	dev="$(dev_from_mount "${arg}")"
	mnt="${arg}"
fi
if [ ! -e "${dev}" ]; then
	echo "${arg}: Not an ext[234] filesystem."
	print_help
	exitcode 16
fi

# Make sure this is an LVM device we can snapshot
# The lvs output is eval'd to obtain LVM2_LV_NAME, LVM2_VG_NAME and
# LVM2_LV_ROLE.
lvm_vars="$(lvs --nameprefixes -o name,vgname,lv_role --noheadings "${dev}" 2> /dev/null)"
eval "${lvm_vars}"
if [ -z "${LVM2_VG_NAME}" ] || [ -z "${LVM2_LV_NAME}" ] ||
   echo "${LVM2_LV_ROLE}" | grep -q "snapshot"; then
	echo "${arg}: Not connnected to an LVM logical volume."
	print_help
	exitcode 16
fi
start_time="$(date +'%Y%m%d%H%M%S')"
snap="${LVM2_LV_NAME}.e2scrub"
snap_dev="/dev/${LVM2_VG_NAME}/${snap}"

# Remove the e2scrub snapshot.
# lvremove is retried every half second while the snapshot device still
# exists and lvremove keeps exiting with status 5, for at most 30s.  If
# it is still there after that we give up; setup() and the -r option
# both try to remove a leftover snapshot.
teardown() {
	local rc
	local deadline="$(( $(date "+%s") + 30))"

	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
	rc=$?
	# The lvremove status has to be saved: by the time the loop
	# condition runs, $? holds the result of the -e test instead.
	while [ -e "${snap_dev}" ] && [ "${rc}" -eq 5 ] &&
	      [ "$(date "+%s")" -lt "${deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
		rc=$?
	done
}

# Run e2fsck against the snapshot; the exit status is that of the
# e2fsck run that ended the check.
check() {
	# First we recover the journal, then we see if e2fsck tries any
	# non-optimization repairs.  If either of these two returns a
	# non-zero status (errors fixed or remaining) then this fs is bad.
	E2FSCK_FIXES_ONLY=1
	export E2FSCK_FIXES_ONLY
	${DBG} "@root_sbindir@/e2fsck" -E journal_only -p ${e2fsck_opts} "${snap_dev}" || return $?
	${DBG} "@root_sbindir@/e2fsck" -f -y ${e2fsck_opts} "${snap_dev}"
}

# Run tune2fs -C 0 -T <scrub start time> on the real device.
mark_clean() {
	${DBG} "@root_sbindir@/tune2fs" -C 0 -T "${start_time}" "${dev}"
}

# Run tune2fs -E force_fsck on the real device.
mark_corrupt() {
	${DBG} "@root_sbindir@/tune2fs" -E force_fsck "${dev}"
}

# Remove any stale e2scrub snapshot and create a fresh one.
# Returns 0 on success, 1 if the old snapshot could not be removed or
# the new one could not be created.
setup() {
	local rc
	# Try to remove snapshot for 30s, bail out if we can't remove it.
	local lvremove_deadline="$(( $(date "+%s") + 30))"

	${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}" 2>/dev/null
	rc=$?
	# Test the saved lvremove status; $? in the loop condition would
	# be the result of the -e test, not of lvremove.
	while [ -e "${snap_dev}" ] && [ "${rc}" -eq 5 ] &&
	      [ "$(date "+%s")" -lt "${lvremove_deadline}" ]; do
		sleep 0.5
		${DBG} lvremove -f "${LVM2_VG_NAME}/${snap}"
		rc=$?
	done
	if [ -e "${snap_dev}" ]; then
		echo "${arg}: e2scrub snapshot is in use, cannot check!"
		return 1
	fi
	# Create the snapshot, wait for device to appear.
	${DBG} lvcreate -s -L "${snap_size_mb}m" -n "${snap}" "${LVM2_VG_NAME}/${LVM2_LV_NAME}"
	if [ $? -ne 0 ]; then
		echo "${arg}: e2scrub snapshot FAILED, will not check!"
		return 1
	fi
	${DBG} udevadm settle 2> /dev/null
	return 0
}

# -r: only remove an existing snapshot, then exit.
if [ "${reap}" -gt 0 ]; then
	if [ -e "${snap_dev}" ]; then
		teardown 2> /dev/null
	fi
	exit 0
fi
if ! setup; then
	exitcode 8
fi
# From here on a snapshot exists; remove it if we exit early or are
# interrupted.
trap "teardown; exit 1" EXIT INT QUIT TERM

# Check and react
check
case "$?" in
"0")
	# Clean check!
	echo "${arg}: Scrub succeeded."
	mark_clean
	teardown
	trap '' EXIT

	# Trim the free space, which requires the snapshot be deleted.
	if [ "${fstrim}" -eq 1 ] && [ -d "${mnt}" ] && type fstrim > /dev/null 2>&1; then
		echo "${arg}: Trimming free space."
		fstrim -v "${mnt}"
	fi

	ret=0
	;;
"8")
	# Operational error, what now?
	echo "${arg}: e2fsck operational error."
	teardown
	trap '' EXIT
	ret=8
	;;
*)
	# fsck failed.  Check if the snapshot is invalid; if so, make a
	# note of that at the end of the log.  This isn't necessarily a
	# failure because the mounted fs could have overflowed the
	# snapshot with regular disk writes /or/ our repair process
	# could have done it by repairing too much.
	#
	# If it's really corrupt we ought to fsck at next boot.
	is_invalid="$(lvs -o lv_snapshot_invalid --noheadings "${snap_dev}" | awk '{print $1}')"
	if [ -n "${is_invalid}" ]; then
		echo "${arg}: Scrub FAILED due to invalid snapshot."
		ret=8
	else
		echo "${arg}: Scrub FAILED due to corruption!  Unmount and run e2fsck -y."
		mark_corrupt
		ret=6
	fi
	teardown
	trap '' EXIT
	;;
esac

exitcode "${ret}"