-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbtrfs-scrub-slowly
executable file
·335 lines (296 loc) · 10.9 KB
/
btrfs-scrub-slowly
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/bin/bash
# Copyright (C) 2016-2018 Graham R. Cobb
# Released under GPL V2 -- see LICENSE
# Note: the master version of this file is in git but it has to be checked
# and then stored somewhere like /usr/local/sbin/ to be safe to be run automatically by root.
# Never have root automatically run programs/scripts that can be modified by non-root users.
# btrfs-scrub-slowly [options] <disk>
#
# Do a btrfs scrub with frequent breaks.
#
# This script is designed to run a btrfs scrub in portions.
# There can be delays between portions, and there are hooks to allow services
# to be stopped during the scrub portion and restarted again afterwards.
#
# GRC - site-specific deployment check: if the helper is installed, run it on
# this script to check there is not a new version waiting to be deployed.
if [ -x /usr/local/bin/check-usr-local-sbin ]
then
  /usr/local/bin/check-usr-local-sbin "$0"
fi

# Default values for the command-line options (see usage).
limit=0               # -l: maximum number of portions (only enforced when > 0)
tlimit=0              # -t: maximum total time in seconds (only enforced when > 0)
portion_tlimit=3600   # -T: timeout for each portion in seconds (1 hour); 0 = no portion timer
interval=600          # -i: delay between portions in seconds (10 minutes)
hook=true             # -h: command run as "<hook> pre|post <args>" around each portion
debug=0               # -D: set to 1 to print additional debugging information
# Print the command-line synopsis and the option summary on stdout.
# Globals read: limit, tlimit, portion_tlimit, interval. Their current values
# are shown as the defaults, so if an option has already been parsed it is the
# parsed value that is displayed.
# The values are passed to printf as quoted arguments so that they are printed
# literally (no word splitting, no filename expansion).
usage() {
  printf 'Usage: %s  [options] <disk>\n' "$0"
  printf '\t<disk>\tdisk to scrub, for example: /mnt/backup2\n'
  printf 'Options:\n'
  printf '\t-l | --limit\t<count> - maximum number of portions, default %s\n' "$limit"
  printf '\t-t | --time\t<seconds> - maximum total time (in seconds), default %s\n' "$tlimit"
  printf '\t-T | --portion-time\t<seconds> - timeout for each scrub portion (in seconds), default %s\n' "$portion_tlimit"
  printf '\t-i | --interval\t<seconds> - delay between portions, default %s\n' "$interval"
  printf '\t-h | --hook\t<command-or-function> - command to execute before/after portion of scrub, default "true"\n'
  printf '\t-D | --debug - print additional debugging information\n'
}
# Parse the command line: zero or more options followed by exactly one <disk>.
# The loop only runs while more than one argument remains, so the last argument
# is never interpreted as an option; it is left in $1 for the code that follows.
# Exit status: 98 for an unrecognised option or an invalid option value,
#              97 if no <disk> argument is left after the options.
while [ $# -gt 1 ]
do
  case "$1" in
    -l | --limit | -t | --time | -T | --portion-time | -i | --interval)
      # These values are used later in integer tests and shell arithmetic.
      # Anything that is not a plain non-negative decimal integer would make
      # those tests fail (silently disabling a limit, or expiring the portion
      # timer immediately), so reject it here.
      case "$2" in
        '' | *[!0-9]*)
          echo "Invalid value for $1: '$2' (expected a non-negative integer)"
          usage
          exit 98
          ;;
      esac
      # 10# forces base 10 so that a leading zero is not later read as octal.
      case "$1" in
        -l | --limit) limit=$((10#$2)) ;;
        -t | --time) tlimit=$((10#$2)) ;;
        -T | --portion-time) portion_tlimit=$((10#$2)) ;;
        -i | --interval) interval=$((10#$2)) ;;
      esac
      shift 2
      ;;
    -h | --hook)
      hook=$2
      shift 2
      ;;
    -D | --debug)
      debug=1
      shift
      ;;
    *)
      echo "Unrecognised option $1"
      usage
      exit 98
      ;;
  esac
done
if [ $# -ne 1 ]
then
  echo "Insufficient arguments"
  usage
  exit 97
fi
# Sum a list of decimal integers read from stdin (one per line) and print the
# total on stdout - used for adding the per-device figures together.
#  - A final line without a trailing newline is still counted.
#  - Leading zeros are stripped so the numbers are always read as decimal.
#  - Blank lines count as nothing; any other non-numeric line is reported on
#    stderr and skipped rather than being evaluated as a shell expression.
total-lines() {
  local num total=0
  local re='^[[:space:]]*(-?)0*([0-9]+)[[:space:]]*$'
  # "|| [[ -n ... ]]" picks up the last line when it has no trailing newline
  while IFS= read -r num || [[ -n "$num" ]]
  do
    if [[ "$num" =~ $re ]]
    then
      # BASH_REMATCH[1] is the optional sign, [2] the digits without leading zeros
      ((total += ${BASH_REMATCH[1]}${BASH_REMATCH[2]}))
    elif [[ "$num" =~ [^[:space:]] ]]
    then
      echo "total-lines: ignoring non-numeric input: $num" >&2
    fi
  done
  echo "$total"
}
# The single argument left after option parsing is the filesystem to scrub.
disk=$1
# Filesystem UUID as reported by "btrfs fi show" (used to name the scrub status file).
# awk picks the second whitespace-separated field, so it copes with the run of
# spaces that the grep pattern allows after "uuid:".
fs_uuid=$(btrfs fi show "$disk" | grep -o 'uuid: \+[-0-9a-f]\+' | awk '{print $2}')
if [ -z "$fs_uuid" ]
then
  echo "Cannot determine the btrfs filesystem UUID for $disk" >&2
  exit 96
fi
# Total size: the sum of every "size <bytes>" figure printed by "btrfs fi show --raw".
fs_size=$(btrfs fi show --raw "$fs_uuid" | grep -o 'size \+[-0-9]\+' | awk '{print $2}' | total-lines)
# The size is used later as a divisor and as the basis of the runaway check,
# so refuse to continue without a positive integer.
case "$fs_size" in
  '' | 0 | *[!0-9]*)
    echo "Cannot determine the size of filesystem $fs_uuid ($disk): got '$fs_size'" >&2
    exit 96
    ;;
esac
# Calculate how many bytes have been scrubbed, for log messages and for
# protecting against runaways.
#   $1 - filesystem path handed to "btrfs scrub status"
# Prints the sum of every number found on the "*_bytes_scrubbed" lines of the
# per-device raw status output.
calc-bytes-scrubbed() {
  btrfs scrub status -dR "$1" | grep -F _bytes_scrubbed | grep -o '[0-9]\+' | total-lines
}
# Debug helper: verify that the saved scrub status has a non-zero last_physical.
#   $1 - filesystem UUID; the file examined is /var/lib/btrfs/scrub.status.<UUID>
# Returns 1 (after printing an error line and the whole status file) when the
# file contains a "|last_physical:0|" field, otherwise returns 0.
# Note: the result is only meaningful once the scrub has been cancelled or has
# finished; while a scrub is running the file may not be completely up to date.
check-last-physical() {
  local uuid=$1
  local path="/var/lib/btrfs/scrub.status.${uuid}"

  # Nothing to report unless the zero marker is present in the file
  grep -F --quiet '|last_physical:0|' "$path" || return 0

  echo "%%%ERROR: last_physical is 0"
  cat "$path"
  return 1
}
# Wait for a number of seconds while showing the time remaining in the process
# name that ps displays.
#   $1 - time to sleep IN SECONDS (zero or a negative value returns at once)
#   $2 - message placed at the front of the process name
# Each second is a separate "sleep 1s" whose argv[0] is set (with exec -a) to
# "<message> ([H:]M:S), sleep".
sleep-showing-remaining-seconds() {
  local remaining hours mins secs
  for ((remaining = $1; remaining-- > 0; ))
  do
    hours="$((remaining / 3600)):"
    # Leave the hours field out altogether while it is zero
    [ "$hours" != "0:" ] || hours=""
    mins="$(((remaining % 3600) / 60)):"
    secs="$((remaining % 60))"
    # The title is handed over as a positional parameter instead of being
    # pasted into the command text, so quotes in the message cannot break it.
    bash -c 'exec -a "$1" sleep 1s' bash "$2 ($hours$mins$secs), sleep"
  done
}
# Start the background timer that limits the length of one scrub portion.
# Globals read: portion_tlimit, disk, debug, fs_uuid.
# Global written: portion_timer (PID of the background timer subshell).
# Does nothing when portion_tlimit is 0.
# When the timer expires it runs "btrfs scrub cancel" on the disk; the exit
# status of the timer subshell is the exit status of that cancel command.
start-portion-timer() {
  [ "$portion_tlimit" -eq 0 ] && return
  # Run a timer and cancel the scrub if it expires
  echo "Portion timer ($portion_tlimit seconds) starting"
  (
    sleep-showing-remaining-seconds "$portion_tlimit" Scrubbing
    echo "Portion timeout ($portion_tlimit seconds) - scrub will be cancelled"
    [ "$debug" -ne 0 ] && echo '>>>>' && echo "Scrub status just before cancel..." && btrfs scrub status -dR "$disk" && btrfs scrub status -d "$disk" && echo '<<<<'
    btrfs scrub cancel "$disk"
    ret=$?
    # Let the scrub exit
    # WARNING: these must not take so long that stop-portion-timer can drop through to its kill command before we have exited
    # See the comment in stop-portion-timer before changing the following few lines.
    sleep 5s
    [ "$debug" -ne 0 ] && echo '>>>>' && echo "Scrub status just after cancel..." && btrfs scrub status -dR "$disk" && btrfs scrub status -d "$disk" && echo '<<<<'
    [ "$debug" -ne 0 ] && check-last-physical "$fs_uuid"
    exit "$ret"
  ) &
  portion_timer=$!
}
# Stop the portion timer and report how it ended.
# Return value:
#   129    - the portion timer is not configured (portion_tlimit is 0)
#   143    - the kill below terminated the timer while it was still running
#            (the scrub finished before the portion timer did)
#   <= 128 - the timer had already finished: this is the return value of the
#            scrub cancel it performed
stop-portion-timer() {
  if [ "$portion_tlimit" -eq 0 ]
  then
    return 129
  fi
  # Reduce the race with the portion timer finishing its cancel processing
  # (and debug output).
  # WARNING - this sleep must be long enough for the "scrub exit" actions in
  # start-portion-timer to complete.
  sleep 10s
  kill "$portion_timer" 2>/dev/null
  wait "$portion_timer" 2>/dev/null
  return $?
}
# Run one portion of the scrub, surrounded by the pre- and post-scrub hooks.
# Arguments: "$@" is passed to both hooks and to "btrfs scrub start|resume".
# Globals read: disk, fs_uuid, debug, count, startorresume.
# Global written: startorresume (set to "resume" once a scrub has been attempted).
# Returns success (0) if another portion should be run, i.e. the scrub exited
# with status 1 and the portion timer had expired and cancelled it.
# Otherwise returns non-zero:
#   - the pre-scrub hook status if that hook failed (the scrub is not attempted)
#   - 255 if the scrub command exited with 0 (the scrub finished)
#   - the post-scrub hook status if that hook failed
#   - in all other cases the exit status of the scrub command
run-portion() {
  local hookstatus runstatus timerstatus outfile last_physical num

  pre-scrub-hook "$@"
  hookstatus=$?
  if [ "$hookstatus" -ne 0 ]
  then
    echo "Pre-scrub hook failed ($hookstatus)"
    return "$hookstatus"
  fi

  echo "Scrub status before $startorresume..."
  echo '>>>>' && btrfs scrub status -dR "$disk" && btrfs scrub status -d "$disk" && echo '<<<<'
  [ "$debug" -ne 0 ] && [ "$count" -gt 1 ] && check-last-physical "$fs_uuid"

  echo "scrub $count $startorresume ..."
  start-portion-timer
  outfile="$(mktemp)"
  # "command time" runs the external time program rather than the shell keyword
  command time -v nice -10 btrfs scrub "$startorresume" -Bd -c 3 "$@" >"$outfile"
  runstatus=$?
  stop-portion-timer
  timerstatus=$?
  cat "$outfile" ; echo
  [ "$debug" -ne 0 ] && echo "actual runstatus=$runstatus, timerstatus=$timerstatus"

  # Next run will be a "resume"
  startorresume="resume"

  # Scrub status 0 means the scrub finished, which means do not try again
  if [ "$runstatus" -eq 0 ]
  then
    runstatus=255
  fi

  # Scrub status 1 and timer status <= 128 means the scrub ran out of time
  # and was cancelled, which means try again
  if [ "$runstatus" -eq 1 ] && [ "$timerstatus" -le 128 ]
  then
    runstatus=0
  fi

  post-scrub-hook "$@"
  hookstatus=$?
  if [ "$hookstatus" -ne 0 ]
  then
    echo "Post-scrub hook failed ($hookstatus)"
    runstatus=$hookstatus
  fi
  [ "$debug" -ne 0 ] && echo "modified runstatus=$runstatus"

  # Debugging bad last_physical
  if [ "$debug" -ne 0 ]
  then
    # This check is being tested to see if it can replace the one below
    check-last-physical "$fs_uuid"
    # If we are going to resume, at least one last_physical must be non-zero
    if [ "$runstatus" -eq 0 ]
    then
      last_physical=0
      # The summing loop and the check share one brace group because the
      # right-hand side of a pipeline runs in a subshell: the total would
      # not be visible after the pipeline ends.
      btrfs scrub status -dR "$disk" | grep -F last_physical | grep -o '[0-9]\+' |
        {
          while read -r num
          do
            ((last_physical += num))
          done
          if [ "$last_physical" -eq 0 ]
          then
            echo "%%%ERROR: Resuming with all last_physical equal to 0!!"
            echo '>>>>' && btrfs scrub status -dR "$disk" && btrfs scrub status -d "$disk" && echo '<<<<'
          fi
        }
    fi
  fi

  rm -f -- "$outfile"
  return "$runstatus"
}
# Invoke the configured hook before a scrub portion: <hook> pre <args...>
# The function's exit status is the exit status of the hook.
pre-scrub-hook() {
  local phase="pre"
  # $hook is expanded unquoted, so a value containing spaces is split into a
  # command name followed by its leading arguments.
  $hook "$phase" "$@"
}
# Invoke the configured hook after a scrub portion: <hook> post <args...>
# The function's exit status is the exit status of the hook.
post-scrub-hook() {
  local phase="post"
  # $hook is expanded unquoted, so a value containing spaces is split into a
  # command name followed by its leading arguments.
  $hook "$phase" "$@"
}
# Hook function to stop mail handling during the scrub.
# Used for main data disk.
#   $1 - "pre":  stop fetchmail, postfix and dovecot (in that order). If any
#                stop fails, log it, try to start all three again and return
#                the failing status.
#        "post": start dovecot, postfix and fetchmail (in that order). The
#                first failure is logged and returned, and the remaining
#                services are not started.
# Returns 0 on success, the failing systemctl status otherwise, or 97 when $1
# is neither "pre" nor "post".
hook-nomail() {
  local status service
  case "$1" in
    pre)
      systemctl stop fetchmail && systemctl stop postfix && systemctl stop dovecot
      status=$?
      if [ "$status" -ne 0 ]
      then
        # Stopping services failed. Try to start them again
        logger -s -t "$0" "Stopping fetchmail, postfix or dovecot failed. Trying restarting."
        for service in dovecot postfix fetchmail
        do
          systemctl start "$service"
        done
      fi
      return "$status"
      ;;
    post)
      # Order matters: per the original author's note postfix depends on
      # dovecot-lmtp, so nothing further is started once a service fails.
      for service in dovecot postfix fetchmail
      do
        systemctl start "$service"
        status=$?
        if [ "$status" -ne 0 ]
        then
          logger -s -t "$0" "Could not restart $service: $status"
          return "$status"
        fi
      done
      return 0
      ;;
    *)
      echo "$0 internal error: invalid hook argument $1"
      return 97
      ;;
  esac
}
# Main loop: run portions until run-portion reports that no further portion
# should be run, or until the portion limit, the time limit or the runaway
# check stops the loop.
#   count   - number of the portion being run (and, after the loop, the number
#             of portions that were run)
#   seconds - accumulated time spent in portions (pauses are not included);
#             each portion adds its elapsed time plus one second
echo "Starting scrub on $disk ($fs_size bytes)"
startorresume="start"
seconds=0
count=1
while true
do
  portion_start="$(date --utc +%s)"
  run-portion "$@"
  portion_status=$?
  # Account for this portion exactly once, whichever way the loop ends
  portion_end="$(date --utc +%s)"
  ((seconds += 1 + portion_end - portion_start))

  # Non-zero means do not run another portion (finished, failed or hook failure)
  [ "$portion_status" -eq 0 ] || break

  [ -n "$limit" ] && [ "$limit" -gt 0 ] && [ "$count" -ge "$limit" ] && echo "Limit reached: $limit" && break
  [ -n "$tlimit" ] && [ "$tlimit" -gt 0 ] && [ "$seconds" -ge "$tlimit" ] && echo "Time limit reached: $tlimit seconds" && break

  # Work out number of bytes scrubbed so far
  scrubbed=$(calc-bytes-scrubbed "$disk")
  if [ "$debug" -ne 0 ]
  then
    # Avoid a division by zero if the filesystem size could not be determined
    percent="?"
    [ "${fs_size:-0}" -gt 0 ] 2>/dev/null && percent=$((100 * scrubbed / fs_size))
    echo "Scrubbed bytes: $scrubbed of $fs_size ($percent%)"
  fi
  [[ "$scrubbed" -gt $((2*fs_size)) ]] && echo "ERROR: Scrubbed bytes ($scrubbed) more than twice disk total size ($fs_size)!!" && break

  [ "$debug" -ne 0 ] && echo "Scrub status before sleeping..." && echo '>>>>' && btrfs scrub status -dR "$disk" && btrfs scrub status -d "$disk" && echo '<<<<'
  sleep-showing-remaining-seconds "${interval}" Paused
  ((count++))
done
btrfs scrub status -dR "$disk" && btrfs scrub status -d "$disk"
echo "Scrub completed in $count portions, $seconds seconds"