forked from zhangnq/nagios
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_iostat.sh
316 lines (268 loc) · 10.1 KB
/
check_iostat.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/bin/bash
#
#http://exchange.nagios.org/directory/Plugins/Operating-Systems/Linux/check_iostat--2D-I-2FO-statistics/details
#----------check_iostat.sh-----------
#
# Version 0.0.2 - Jan/2009
# Changes: added device verification
#
# by Thiago Varela - thiago@iplenix.com
#
# Version 0.0.3 - Dec/2011
# Changes:
# - changed values from bytes to mbytes
# - fixed bug to get traffic data without comma but point
# - current values are displayed now, not average values (first run of iostat)
#
# by Philipp Niedziela - pn@pn-it.com
#
# Version 0.0.4 - April/2014
# Changes:
# - Allow Empty warn/crit levels
# - Can check I/O, WAIT Time, or Queue
#
# by Warren Turner
#
# Version 0.0.5 - Jun/2014
# Changes:
# - removed -y flag from call since iostat doesn't know about it any more (June 2014)
# - only needed executions of iostat are done now (save cpu time whenever you can)
# - fixed the obvious problems of missing input values (probably because of the now unimplemented "-y") with -x values
# - made performance data optional (I like to have choice in the matter)
#
# by Frederic Krueger / fkrueger-dev-checkiostat@holics.at
#
# Version 0.0.6 - Jul/2014
# Changes:
# - Cleaned up argument checking, removed excess iostat calls, streamlined if statements and renamed variables to fit current use
# - Fixed all inputs to match current iostat output (Ubuntu 12.04)
# - Changed to take last ten seconds as default (more useful for nagios usage). Will go to "since last reboot" (previous behaviour) on -g flag.
# - added extra comments/whitespace etc. to aid readability
#
# by Ben Field / ben.field@concreteplatform.com
#
# Version 0.0.7 - Sep/2014
# Changes:
# - Fixed performance data for Wait check
#
# by Christian Westergard / christian.westergard@gmail.com
#
# Locate the external tools this plugin depends on. `command -v` is a shell
# builtin, so the lookup also works on systems without a `which` binary.
# Each variable is left empty when the tool cannot be found in PATH.
iostat=$(command -v iostat 2>/dev/null)
bc=$(command -v bc 2>/dev/null)
# Print the usage text on stdout and terminate the plugin.
# Takes no arguments and never returns: it exits with status 3, the Nagios
# UNKNOWN state, which is the in-range code for "could not run the check"
# (a negative exit value would be reported as 255, outside 0-3).
help() {
  # Quoted delimiter: the text is emitted literally, without any expansion.
  cat <<'EOF'

Usage:
-d =
--Device to be checked. Example: "-d sda"
Run only one of i, q, W:
-i = IO Check Mode
--Checks Total Transfers/sec, Read IO/Sec, Write IO/Sec, Bytes Read/Sec, Bytes Written/Sec
--warning/critical = Total Transfers/sec,Read IO/Sec,Write IO/Sec,Bytes Read/Sec,Bytes Written/Sec
-q = Queue Mode
--Checks Disk Queue Lengths
--warning/critial = Average size of requests, Queue length of requests
-W = Wait Time Mode
--Check the time for I/O requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.
--warning/critical = Avg I/O Wait Time (ms), Avg Read Wait Time (ms), Avg Write Wait Time (ms), Avg Service Wait Time (ms), Avg CPU Utilization
-w,-c = pass warning and critical levels respectively. These are not required, but with out them, all queries will return as OK.
-p = Provide performance data for later graphing
-g = Since last reboot for system (more for debugging that nagios use!)
-h = This help

EOF
  exit 3
}
# Ensuring we have the needed tools:
# Abort with status 3 (Nagios UNKNOWN) when either lookup came back empty or
# does not point to an executable file. The test runs in the current shell,
# not in a ( ... ) subshell, so that `exit` really terminates the plugin.
if [[ -z "$iostat" || ! -x "$iostat" || -z "$bc" || ! -x "$bc" ]]; then
  # printf interprets \n and \t; the package that ships iostat is "sysstat".
  printf 'ERROR: You must have iostat and bc installed in order to run this plugin\n\tuse: apt-get install sysstat bc\n'
  exit 3
fi
# Run-mode flags; the option parser sets exactly one of them to 1.
io=0
queue=0
waittime=0
# 1 = append the performance data section to the plugin output.
printperfdata=0
# Result state text and the matching exit code; raised by the checks.
STATE="OK"
status=0
# Number of reports requested from iostat (passed as its count argument).
# It has to be a plain integer; the -g option lowers it to 1.
samples=2
# Output line and optional performance data, filled in by the selected check.
MSG=""
PERFDATA=""
#------------Argument Set-------------
# Parse the command line options into the global settings:
#   -d <dev>     disk            -w <levels>  warning     -c <levels>  critical
#   -i           io=1            -q           queue=1     -W           waittime=1
#   -p           printperfdata=1 -g           samples=1   -h           usage text
# Arguments: the script's positional parameters ("$@").
# The leading ':' in the optstring selects getopts' silent mode, in which
# OPTARG holds the offending option letter, so the messages below can name it.
# Usage errors go to stderr and exit with 3 (Nagios UNKNOWN).
parse_args() {
  local OPTIND=1 OPT
  while getopts ":d:w:c:ipqWhg" OPT; do
    case "$OPT" in
      d) disk=$OPTARG ;;
      w) warning=$OPTARG ;;
      c) critical=$OPTARG ;;
      i) io=1 ;;
      p) printperfdata=1 ;;
      q) queue=1 ;;
      W) waittime=1 ;;
      g) samples=1 ;;
      h) echo "help:" && help ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        exit 3
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        exit 3
        ;;
    esac
  done
}
parse_args "$@"
# Use device "sda" when no device name has been set
if [[ -z "${disk}" ]]; then
  disk=sda
fi
# The three mode flags are 0 or 1, so their sum must be exactly 1:
# anything else means no mode or more than one mode was requested.
if (( io + queue + waittime != 1 )); then
  echo "ERROR: select one and only one run mode"
  help
fi
#set warning and critical to an insane value if empty, else set the individual values

# split_levels PREFIX LIST
# Store the comma-separated LIST in the global variables PREFIX_1 .. PREFIX_5:
#   1: TPS with IO, Request size with queue
#   2: Read/s with IO, Queue Length with queue
#   3: Write/s with IO
#   4: KB/s read with IO
#   5: KB/s written with IO
# Fields missing from LIST become empty. A LIST without any comma is copied
# into all five variables, which is what the former `cut -d, -fN` calls did.
split_levels() {
  local prefix=$1 list=$2 i
  local -a parts=()
  if [[ "$list" == *,* ]]; then
    # IFS is changed for this read only; the quoted here-string keeps the
    # value away from word splitting and pathname expansion.
    IFS=, read -r -a parts <<< "$list"
  else
    parts=("$list" "$list" "$list" "$list" "$list")
  fi
  for i in 1 2 3 4 5; do
    printf -v "${prefix}_${i}" '%s' "${parts[i - 1]}"
  done
}

if [[ -z "$warning" ]]; then
  warning=99999
else
  split_levels warn "$warning"
  #Crude hack due to integer expression later in the script
  warning=1
fi
if [[ -z "$critical" ]]; then
  critical=99999
else
  split_levels crit "$critical"
  #Crude hack due to integer expression later in the script
  critical=1
fi
#------------Argument Set End-------------
#------------Parameter Check-------------
# The requested disk must exist as a block device node below /dev;
# otherwise report the problem and show the usage text.
if [[ ! -b "/dev/${disk}" ]]; then
  echo "ERROR: Device incorrectly specified"
  help
fi
#Checks for sane warning/critical levels

# warn_exceeds_crit WARN CRIT
# Returns 0 only when both levels are present and WARN is numerically greater
# than CRIT. awk performs the comparison so that decimal levels such as 1.5
# are handled; the shell's integer tests would raise an error on them.
warn_exceeds_crit() {
  [[ -n "$1" && -n "$2" ]] || return 1
  awk -v w="$1" -v c="$2" 'BEGIN { exit !((w + 0) > (c + 0)) }'
}

# warning/critical hold 99999 when the respective option was not given. The
# ordering can only be verified when both were supplied; a lone -w or -c is
# accepted as is.
if [[ "$warning" -ne 99999 && "$critical" -ne 99999 ]]; then
  if warn_exceeds_crit "$warn_1" "$crit_1" || warn_exceeds_crit "$warn_2" "$crit_2"; then
    echo "ERROR: critical levels must be higher than warning levels" && help
  elif [[ "$io" -eq 1 || "$waittime" -eq 1 ]]; then
    # IO and wait-time modes take five levels, so verify the remaining three.
    if warn_exceeds_crit "$warn_3" "$crit_3" ||
       warn_exceeds_crit "$warn_4" "$crit_4" ||
       warn_exceeds_crit "$warn_5" "$crit_5"; then
      echo "ERROR: critical levels must be higher than warning levels" && help
    fi
  fi
fi
#------------Parameter Check End-------------
# iostat parameters:
# -m: megabytes
# -k: kilobytes
# -x: extended statistics
# -d: device utilization report
# "10 $samples": one report every 10 seconds, $samples reports in total.
# The first report of iostat shows statistics since last reboot, the second
# one shows current values of the disk; tail -1 keeps the last line that
# mentions the device.
TMPX=$("$iostat" "$disk" -x -k -d 10 "$samples" | grep -- "$disk" | tail -1)
# Without a data line every value parsed later would be blank and the plugin
# would still print OK, so stop here with UNKNOWN (exit 3) instead.
if [[ -z "$TMPX" ]]; then
  echo "UNKNOWN - iostat returned no statistics for device $disk"
  exit 3
fi
#------------IO Test-------------
# IO mode: report transfers/sec, read and write requests/sec and KB read and
# written per second, and compare each against the matching warning/critical
# level (warn_1..5 / crit_1..5). Sets MSG and PERFDATA, and raises
# STATE/status when a level is reached.
if [[ "$io" == "1" ]]; then
  # Transfers/sec is read from column 2 of the plain (non -x) device report,
  # which requires a second iostat run.
  TMPD=$("$iostat" "$disk" -k -d 10 "$samples" | grep -- "$disk" | tail -1)
  # Split both report lines on whitespace once instead of forking awk per field.
  read -r -a basic_cols <<< "$TMPD"
  read -r -a ext_cols <<< "$TMPX"
  #Requests per second:
  tps=${basic_cols[1]}
  read_sec=${ext_cols[3]}
  written_sec=${ext_cols[4]}
  #Kb per second:
  kbytes_read_sec=${ext_cols[5]}
  kbytes_written_sec=${ext_cols[6]}
  # "Converting" values to float (string replace , with .)
  tps=${tps/,/.}
  read_sec=${read_sec/,/.}
  written_sec=${written_sec/,/.}
  kbytes_read_sec=${kbytes_read_sec/,/.}
  kbytes_written_sec=${kbytes_written_sec/,/.}
  # Comparing the result and setting the correct level
  # (bc prints 1 when the relation holds; 99999 means "no level given"):
  if [[ "$warning" -ne 99999 ]]; then
    if [[ "$(echo "$tps >= $warn_1" | bc)" == "1" ]] ||
       [[ "$(echo "$read_sec >= $warn_2" | bc)" == "1" ]] ||
       [[ "$(echo "$written_sec >= $warn_3" | bc)" == "1" ]] ||
       [[ "$(echo "$kbytes_read_sec >= $warn_4" | bc)" == "1" ]] ||
       [[ "$(echo "$kbytes_written_sec >= $warn_5" | bc)" == "1" ]]; then
      STATE="WARNING"
      status=1
    fi
  fi
  # Critical is evaluated last so that it overrides a warning.
  if [[ "$critical" -ne 99999 ]]; then
    if [[ "$(echo "$tps >= $crit_1" | bc)" == "1" ]] ||
       [[ "$(echo "$read_sec >= $crit_2" | bc)" == "1" ]] ||
       [[ "$(echo "$written_sec >= $crit_3" | bc)" == "1" ]] ||
       [[ "$(echo "$kbytes_read_sec >= $crit_4" | bc)" == "1" ]] ||
       [[ "$(echo "$kbytes_written_sec >= $crit_5" | bc)" == "1" ]]; then
      STATE="CRITICAL"
      status=2
    fi
  fi
  # Printing the results:
  MSG="$STATE - I/O stats: Transfers/Sec=$tps Read Requests/Sec=$read_sec Write Requests/Sec=$written_sec KBytes Read/Sec=$kbytes_read_sec KBytes_Written/Sec=$kbytes_written_sec"
  # Every perfdata label is a bare token; an unbalanced quote in a label
  # would make the performance data unparsable.
  PERFDATA=" | total_io_sec=$tps; read_io_sec=$read_sec; write_io_sec=$written_sec; kbytes_read_sec=$kbytes_read_sec; kbytes_written_sec=$kbytes_written_sec;"
fi
#------------IO Test End-------------
#------------Queue Test-------------
# Queue mode: columns 8 and 9 of the extended report line are reported as
# average request size and average queue length and compared against the
# first two warning/critical levels.
if [[ "$queue" == "1" ]]; then
  read -r -a xfields <<< "$TMPX"
  # Decimal comma -> decimal point so that bc can read the numbers
  qsize=${xfields[7]/,/.}
  qlength=${xfields[8]/,/.}
  # 99999 marks "no level supplied"; bc prints 1 when the relation holds
  if [[ "$warning" -ne 99999 ]] &&
     { [[ "$(bc <<< "$qsize >= $warn_1")" == "1" ]] ||
       [[ "$(bc <<< "$qlength >= $warn_2")" == "1" ]]; }; then
    STATE="WARNING"
    status=1
  fi
  # Evaluated after the warning test, so critical takes precedence
  if [[ "$critical" -ne 99999 ]] &&
     { [[ "$(bc <<< "$qsize >= $crit_1")" == "1" ]] ||
       [[ "$(bc <<< "$qlength >= $crit_2")" == "1" ]]; }; then
    STATE="CRITICAL"
    status=2
  fi
  # Status line and matching performance data
  MSG="$STATE - Disk Queue Stats: Average Request Size=$qsize Average Queue Length=$qlength"
  PERFDATA=" | qsize=$qsize; queue_length=$qlength;"
fi
#------------Queue Test End-------------
#------------Wait Time Test-------------
# Wait-time mode: columns 10-14 of the extended report line are reported as
# average wait, read wait, write wait, service time and utilization.
# Warning - svc time will soon be deprecated and the column numbers will need
# to change. A future parser could look at the label line to pick the columns.
if [[ "$waittime" == "1" ]]; then
  read -r -a xfields <<< "$TMPX"
  # Decimal comma -> decimal point so that bc can read the numbers
  avgwait=${xfields[9]/,/.}
  avgrwait=${xfields[10]/,/.}
  avgwwait=${xfields[11]/,/.}
  avgsvctime=${xfields[12]/,/.}
  avgcpuutil=${xfields[13]/,/.}
  # 99999 marks "no level supplied"; bc prints 1 when the relation holds
  if [[ "$warning" -ne 99999 ]] &&
     { [[ "$(bc <<< "$avgwait >= $warn_1")" == "1" ]] ||
       [[ "$(bc <<< "$avgrwait >= $warn_2")" == "1" ]] ||
       [[ "$(bc <<< "$avgwwait >= $warn_3")" == "1" ]] ||
       [[ "$(bc <<< "$avgsvctime >= $warn_4")" == "1" ]] ||
       [[ "$(bc <<< "$avgcpuutil >= $warn_5")" == "1" ]]; }; then
    STATE="WARNING"
    status=1
  fi
  # Evaluated after the warning test, so critical takes precedence
  if [[ "$critical" -ne 99999 ]] &&
     { [[ "$(bc <<< "$avgwait >= $crit_1")" == "1" ]] ||
       [[ "$(bc <<< "$avgrwait >= $crit_2")" == "1" ]] ||
       [[ "$(bc <<< "$avgwwait >= $crit_3")" == "1" ]] ||
       [[ "$(bc <<< "$avgsvctime >= $crit_4")" == "1" ]] ||
       [[ "$(bc <<< "$avgcpuutil >= $crit_5")" == "1" ]]; }; then
    STATE="CRITICAL"
    status=2
  fi
  # Status line and matching performance data
  MSG="$STATE - Wait Time Stats: Avg I/O Wait Time (ms)=$avgwait Avg Read Wait Time (ms)=$avgrwait Avg Write Wait Time (ms)=$avgwwait Avg Service Wait Time (ms)=$avgsvctime Avg CPU Utilization=$avgcpuutil"
  PERFDATA=" | avg_io_waittime_ms=$avgwait; avg_r_waittime_ms=$avgrwait; avg_w_waittime_ms=$avgwwait; avg_service_waittime_ms=$avgsvctime; avg_cpu_utilization=$avgcpuutil;"
fi
#------------Wait Time End-------------
# now output the official result
# The status text comes first, followed by the performance data section when
# it was requested, and one closing newline so that everything is printed on
# a single line.
if [[ "$printperfdata" == "1" ]]; then
  printf '%s%s\n' "$MSG" "$PERFDATA"
else
  printf '%s\n' "$MSG"
fi
# The exit code hands the check result to the caller: status stays 0 while
# STATE is OK and is set to 1 for WARNING and 2 for CRITICAL by the checks.
exit $status
#----------/check_iostat.sh-----------