Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F203780
check-raid.py
fgiunchedi (Filippo Giunchedi)
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
fgiunchedi
Jul 21 2015, 3:57 PM
2015-07-21 15:57:57 (UTC+0)
Size
15 KB
Referenced Files
None
Subscribers
None
check-raid.py
View Options
#!/usr/bin/python
import
os.path
import
re
import
subprocess
import
sys
def main():
    """Detect RAID controllers, run the relevant checks, print a report.

    Prints one Nagios-style line ``STATUS: report, report, ...`` to stdout
    and terminates the process via ``sys.exit`` with the matching status
    code (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).
    """
    # Each check function returns its most dire status code and a text
    # string to append to the final report.
    checks = {
        'arcconf': checkAdaptec,
        'tw_cli': check3ware,
        'MegaRAID_SAS': checkMegaSAS,
        'FusionMPT': checkFusionMPT,
        'HPSA': checkHPSA,
        'mdadm': checkLinuxRAID,
    }

    statuses = []
    reports = []
    for utility in getLinuxUtility():
        check = checks.get(utility)
        if check is None:
            # Something was detected that has no check implemented here.
            code, text = 0, '%s found but unsupported' % utility
        else:
            code, text = check()
        reports.append(text)
        statuses.append(code)

    if not statuses:
        reports.append('no RAID found')
        statuses.append(0)

    # Translate the Nagios status code into plain English, join the
    # per-controller text reports into one long string and print the
    # result to stdout.
    exit_status = mostDireStatus(statuses)
    status_string = {
        3: 'UNKNOWN',
        2: 'CRITICAL',
        1: 'WARNING',
        0: 'OK',
    }
    # The parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print('%s: %s' % (status_string[exit_status], ', '.join(reports)))
    sys.exit(exit_status)
def mostDireStatus(status_codes=()):
    """Find the most interesting status code in a collection.

    Precedence is critical (2) > warning (1) > unknown (3) > ok (0).

    :param status_codes: container of integer Nagios status codes
    :returns: integer from 0-3; 0 when no code of interest is present
    """
    # Immutable default: a shared ``[]`` default is a latent trap even
    # though this function never mutates it.
    for code in (2, 1, 3):
        if code in status_codes:
            return code
    return 0
def shellCall(command=None):
    """Execute an external command and collect its stdout/stderr.

    :param command: argument list passed to ``subprocess.Popen``
    :returns: ``(stdoutdata, stderrdata)`` as text strings
    :raises Exception: when the command cannot be started, or when it
        exits with a nonzero status.  For a nonzero exit the message is
        stderr if non-empty, else stdout, else ``return code = N``, with
        whitespace runs collapsed and capped at 200 characters + ``...``.
    """
    if command is None:
        command = []
    try:
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # Text mode: callers run str regexes over the output, which
            # would fail on the bytes Python 3 returns by default.
            universal_newlines=True,
        )
        # collect and cache stdout and stderr
        stdoutdata, stderrdata = proc.communicate()
    except Exception as err:
        # Normalise launch failures (OSError, ...) to a plain Exception
        # carrying only the message.  KeyboardInterrupt and SystemExit are
        # deliberately NOT caught here.
        raise Exception(str(err))

    # report any nonzero exit status with informative stderr or stdout info
    if proc.returncode:
        msg = (stderrdata or stdoutdata
               or 'return code = %s' % proc.returncode)
        msg = re.sub(r'\s+', ' ', msg).strip()
        if len(msg) > 200:
            msg = msg[:200] + '...'
        raise Exception(msg)
    return (stdoutdata, stderrdata)
def _firstKnownName(path, pattern, known):
    """Return the value mapped to the first known name found in a file.

    Each line of *path* is matched against *pattern*; the first line whose
    group 1 is a key of *known* decides the result.  Returns None when no
    line qualifies.  The file is closed even if reading fails.
    """
    regex = re.compile(pattern)
    with open(path, 'r') as handle:
        for line in handle:
            found = regex.match(line)
            if found is not None and found.group(1) in known:
                return known[found.group(1)]
    return None


def getLinuxUtility():
    """Check /proc and mdadm for RAID devices/configuration.

    At most one entry is taken from ``/proc/devices`` and at most one from
    ``/proc/modules`` (the first recognised line in each file wins);
    ``'mdadm'`` is added when software RAID arrays are configured.

    :returns: list of utility/controller identifiers
    """
    utility = []

    # drivers that register themselves in /proc/devices
    name = _firstKnownName(
        '/proc/devices',
        r'^\s*\d+\s+(\w+)',
        {
            'aac': 'arcconf',
            'twe': 'tw_cli',
            'megadev': 'megarc',
            'megaraid_sas_ioctl': 'MegaRAID_SAS',
        },
    )
    if name is not None:
        utility.append(name)

    # drivers that only show up as loaded kernel modules
    name = _firstKnownName(
        '/proc/modules',
        r'^(\w+)\s\d+',
        {
            'hpsa': 'HPSA',
            'mptbase': 'FusionMPT',
        },
    )
    if name is not None:
        utility.append(name)

    # check for software RAID
    if getLinuxRAIDDevices():
        utility.append('mdadm')

    return utility
def checkFusionMPT():
    """Check Fusion MPT RAID state by parsing ``mpt-status`` output.

    Every ``log_id``/``phys_id`` line is added to the report.  A logical
    drive that is not ``OPTIMAL`` or a physical drive that is not
    ``ONLINE`` counts as critical; a failure to run ``mpt-status`` counts
    as unknown.

    :returns: ``(status, report)`` -- most dire status code and a
        ``'FusionMPT ...'`` summary string
    """
    report = []
    status = []

    try:
        stdoutdata, _ = shellCall(
            ['/usr/sbin/mpt-status', '--autoload', '--status_only'])
    except Exception as err:
        report.append(str(err))
        status.append(3)
    else:
        # (line pattern, report prefix, healthy state) per kind of drive
        kinds = (
            (re.compile(r'log_id (\d+) (\w+)'), 'log', 'OPTIMAL'),
            (re.compile(r'phys_id (\d+) (\w+)'), 'phy', 'ONLINE'),
        )
        for line in stdoutdata.splitlines():
            for regex, prefix, healthy in kinds:
                found = regex.match(line)
                if found is None:
                    continue
                ident, state = found.groups()
                report.append('%s_%s:%s' % (prefix, ident, state))
                status.append(0 if state == healthy else 2)

    return (mostDireStatus(status), 'FusionMPT %s' % '; '.join(report))
def checkHPSA():
    """Check Smart Array controllers and drives by parsing ``hpssacli``.

    Runs ``ctrl all show detail`` for controller status and
    ``ctrl all show config`` for logical/physical drive status.  A
    rebuilding drive is a warning, any other non-``OK`` state is
    critical, and a failing ``hpssacli`` call is unknown.

    :returns: ``(status, report)`` -- most dire status code and a
        ``'HPSA ...'`` summary string
    """
    report = []
    status = [0]
    data = {}  # controller label -> list of report fragments

    logicalDriveRx = re.compile(
        r'\s+logicaldrive\s+(\d+)\s+\((.*),\s*([^,]+)\)$')
    physicalDriveRx = re.compile(
        r'\s+physicaldrive\s+(\S+)\s+\([^,]+,[^,]+,[^,]+,\s*([^,)]+)')
    controllerRx = re.compile(r'Smart Array (\w+) in Slot (\d+)')
    controllerStatusRx = re.compile(r'^\s+Controller Status:\s+(\w+)')
    rebuildRx = re.compile(r'\d+% complete$')

    executable = None
    for candidate in ('/usr/sbin/hpssacli', '/opt/usr/sbin/hpssacli'):
        if os.path.exists(candidate):
            executable = candidate
            break
    if executable is None:
        return (3, 'HPSA no hpssacli executable found')

    # get controller status
    try:
        stdoutdata, _ = shellCall(
            [executable, 'ctrl', 'all', 'show', 'detail'])
    except Exception as err:
        status.append(3)
        report.append('hpssacli error: %s' % str(err))
    else:
        controller = 'unknown'
        for line in stdoutdata.splitlines():
            c = controllerRx.match(line)
            if c is not None:
                controller = '%s/slot%s' % (c.group(1), c.group(2))
                continue
            d = controllerStatusRx.match(line)
            if d is not None:
                data[controller] = [d.group(1)]
                if d.group(1) != 'OK':
                    status.append(2)

    # get logical and physical drive status
    try:
        stdoutdata, _ = shellCall(
            [executable, 'ctrl', 'all', 'show', 'config'])
    except Exception as err:
        status.append(3)
        report.append('hpssacli error: %s' % str(err))
    else:
        controller = 'unknown'
        for line in stdoutdata.splitlines():
            c = controllerRx.match(line)
            if c is not None:
                controller = '%s/slot%s' % (c.group(1), c.group(2))
                # '???' marks a controller whose status was never seen
                data.setdefault(controller, ['???'])
                continue

            l = logicalDriveRx.match(line)
            if l is not None:
                number, ldinfo, state = l.groups()
                # setdefault: a drive line may appear before any
                # recognised controller header; indexing would KeyError.
                data.setdefault(controller, ['???']).append(
                    'log_%s: %s %s'
                    % (number, ldinfo.replace(' ', ''), state))
                if rebuildRx.match(state):
                    status.append(1)
                elif state != 'OK':
                    status.append(2)
                continue

            p = physicalDriveRx.match(line)
            if p is not None and p.group(2) != 'OK':
                # only unhealthy physical drives are reported
                data.setdefault(controller, ['???']).append(
                    'phy_%s: %s' % (p.group(1), p.group(2)))
                if p.group(2) == 'Rebuilding':
                    status.append(1)
                else:
                    status.append(2)

    for controller in data:
        report.append('[%s: %s]' % (controller, ', '.join(data[controller])))

    return (mostDireStatus(status), 'HPSA %s' % '; '.join(report))
def checkAdaptec():
    """Check an Adaptec controller by parsing ``arcconf getconfig 1``.

    NOTE: the original author flagged this check as untested against
    real hardware.

    Parsing stops at the first problem found: a nonzero defunct disk
    count, or a nonzero failed/degraded logical device count.  Either is
    reported as critical; a failing ``arcconf`` call is unknown.

    :returns: ``(status, report)`` -- most dire status code and an
        ``'Adaptec ...'`` summary string
    """
    report = []
    status = []

    # change directory so that the log file goes to the right place
    oldDir = os.getcwd()
    os.chdir('/var/log')
    try:
        try:
            stdoutdata, _ = shellCall(['/usr/bin/arcconf', 'getconfig', '1'])
        except Exception as err:
            status.append(3)
            report.append(str(err))
        else:
            defunctRx = re.compile(
                r'^\s*Defunct disk drive count\s*:\s*(\d+)')
            logicalRx = re.compile(
                r'^\s*Logical devices/Failed/Degraded\s*:\s*(\d+)/(\d+)/(\d+)')

            for line in stdoutdata.splitlines():
                m = defunctRx.match(line)
                # Compare by value: ``is not '0'`` tested object identity.
                if m is not None and int(m.group(1)) != 0:
                    # append, do not rebind: assigning a str here made the
                    # final join emit one character per item.
                    report.append('Defunct disk drive count: ' + m.group(1))
                    status.append(2)
                    break

                m = logicalRx.match(line)
                if m is None:
                    continue
                problems = []
                if int(m.group(2)) != 0:
                    problems.append('%s failed' % m.group(2))
                if int(m.group(3)) != 0:
                    # the third counter is labelled "Degraded" in the
                    # output, so report it as such (was "defunct")
                    problems.append('%s degraded' % m.group(3))
                if problems:
                    report.append(
                        'logical devices: %s' % ' and '.join(problems))
                    status.append(2)
                    break
    finally:
        # always restore the working directory, even on unexpected errors
        os.chdir(oldDir)

    return (mostDireStatus(status), 'Adaptec %s' % '; '.join(report))
def check3ware():
    """Check 3ware controllers by parsing ``tw_cli`` output.

    NOTE: the original author flagged this check as untested against
    real hardware.

    Lists the controllers with ``tw_cli show``, then inspects the ``pN``
    drive lines of ``tw_cli /cN show`` for each one.

    :returns: ``(status, report)`` -- 2 with the failed drives listed,
        1 on a ``tw_cli`` error or when no drive line was parsed,
        0 with the number of drives checked otherwise
    """
    # Get the list of controllers
    try:
        stdoutdata, _ = shellCall(['/usr/bin/tw_cli', 'show'])
    except Exception as err:
        return (1, '3ware tw_cli error %s' % str(err))

    controllerRx = re.compile(r'^(c\d+)')
    controllers = []
    for line in stdoutdata.splitlines():
        m = controllerRx.match(line)
        if m is not None:
            # list.append: Python lists have no ``push`` method
            controllers.append('/' + m.group(1))

    # Check each controller
    driveRx = re.compile(r'^(p\d+)\s+([\w-]+)')
    failedDrives = []
    numDrives = 0
    for controller in controllers:
        try:
            stdoutdata, _ = shellCall(['/usr/bin/tw_cli', controller, 'show'])
        except Exception as err:
            return (1, '3ware tw_cli error %s' % str(err))

        for line in stdoutdata.splitlines():
            m = driveRx.match(line)
            if m is None:
                continue
            numDrives += 1
            if m.group(2) != 'OK':
                failedDrives.append(controller + '/' + m.group(1))

    if failedDrives:
        return (
            2,
            '%d failed drive(s): %s'
            % (len(failedDrives), ', '.join(failedDrives)),
        )

    if numDrives == 0:
        return (1, 'no physical drives found, tw_cli parse error?')
    return (0, '%d drives checked' % numDrives)
def checkMegaSAS():
    """Check MegaRAID SAS virtual drives by parsing ``MegaCli -LDInfo``.

    A virtual drive whose state is not ``Optimal`` is critical; a missing
    or failing megacli executable, or output without any adapter line,
    is unknown.

    :returns: ``(status, report)`` -- most dire status code and a
        ``'MegaSAS ...'`` summary string
    """
    report = []
    status = [0]

    # change directory so that the log file goes to the right place
    oldDir = os.getcwd()
    os.chdir('/var/log')
    try:
        try:
            executable = None
            for candidate in ('/opt/MegaRAID/MegaCli/MegaCli64',
                              '/usr/sbin/megacli'):
                if os.path.exists(candidate):
                    executable = candidate
                    break
            if executable is None:
                raise Exception('no megacli executable found')
            stdoutdata, _ = shellCall(
                [executable, '-LDInfo', '-LALL', '-aALL', '-NoLog'])
        except Exception as err:
            report.append('error executing megacli: %s' % str(err))
            status.append(3)
        else:
            adapterRx = re.compile(r'^Adapter\s+(\d+)\s*:?')
            logicalRx = re.compile(r'^Virtual Drive:\s+(\d+)\s+')
            stateRx = re.compile(r'^State\s*:\s*([^\n]*)')
            driveRx = re.compile(
                r'^Number Of Drives\s*(per span)?\s*:\s+(\d+)')

            state = {}          # 'aN/vM' -> lower-cased state text
            physicalCount = {}  # 'aN/vM' -> number of drives
            adCount = 0
            adapter = None
            logical = None

            for line in stdoutdata.splitlines():
                a = adapterRx.match(line)
                if a is not None:
                    adapter = a.group(1)
                    adCount += 1
                    continue

                l = logicalRx.match(line)
                if l is not None:
                    logical = 'a%s/v%s' % (adapter, l.group(1))
                    continue

                s = stateRx.match(line)
                if s is not None:
                    state[logical] = s.group(1).lower()
                    status.append(0 if s.group(1) == 'Optimal' else 2)
                    continue

                d = driveRx.match(line)
                if d is not None:
                    physicalCount[logical] = int(d.group(2))

            if adCount == 0:
                report.append('no adapters found!')
                # record UNKNOWN on the status list; the original called
                # .append on the ``state`` dict, which raises
                status.append(3)
            else:
                report.append(
                    '%s logical, %s physical'
                    % (len(state), sum(physicalCount.values())))
                for logical in state:
                    if state[logical] != 'optimal':
                        report.append(
                            '%s (%s disk array) %s'
                            % (
                                logical,
                                # drive count may be absent from the output
                                physicalCount.get(logical, '?'),
                                state[logical],
                            ))
    finally:
        # always restore the working directory, even on unexpected errors
        os.chdir(oldDir)

    return (mostDireStatus(status), 'MegaSAS %s' % '; '.join(report))
def getLinuxRAIDDevices():
    """Query mdadm for configured RAID arrays.

    :returns: list with the second field of every ``ARRAY`` line printed
        by ``mdadm --detail --scan``; empty when mdadm cannot be run or
        fails
    """
    try:
        stdoutdata, _ = shellCall(['/sbin/mdadm', '--detail', '--scan'])
    except Exception:
        # Deliberate best effort: a missing or failing mdadm is treated
        # as "no software RAID configured".
        return []

    arrayRx = re.compile(r'^ARRAY\s+([^ ]*) ')
    matches = (arrayRx.match(line) for line in stdoutdata.splitlines())
    return [m.group(1) for m in matches if m is not None]
def checkLinuxRAID():
    """Check Linux software RAID arrays by parsing ``mdadm --detail``.

    For every array the Active/Working/Failed/Spare device counters are
    reported.  A nonzero Failed count is critical; a Failed counter that
    is absent from the output, or a failing mdadm call, is unknown.

    :returns: ``(status, report)`` -- most dire status code and a
        ``'LinuxRAID ...'`` summary string
    """
    report = []
    status = []
    data = {}  # array device path -> {counter name: int}

    args = ['/sbin/mdadm', '--detail']
    args.extend(getLinuxRAIDDevices())

    try:
        stdoutdata, _ = shellCall(args)
    except Exception as err:
        return (3, 'LinuxRAID: error executing mdadm: %s' % str(err))

    deviceRx = re.compile(r'^(/[^ ]*):$')
    statRx = re.compile(
        r'^ *(Active|Working|Failed|Spare) Devices *: *(\d+)')

    logical = None
    for line in stdoutdata.splitlines():
        d = deviceRx.match(line)
        if d is not None:
            logical = d.group(1)
            data[logical] = {}
            continue

        s = statRx.match(line)
        # ignore counters seen before any device header: there is no
        # array to attach them to (indexing data[None] would KeyError)
        if s is not None and logical is not None:
            data[logical][s.group(1)] = int(s.group(2))

    for device in data:
        counts = data[device]
        # '?' stands in for a counter the output did not contain
        report.append(
            '%s: act=%s, wk=%s, fail=%s, sp=%s'
            % (
                device,
                counts.get('Active', '?'),
                counts.get('Working', '?'),
                counts.get('Failed', '?'),
                counts.get('Spare', '?'),
            ))
        failed = counts.get('Failed')
        if failed is None:
            status.append(3)
        elif failed > 0:
            status.append(2)
        else:
            status.append(0)

    return (mostDireStatus(status), 'LinuxRAID %s' % '; '.join(report))
# Run the check only when executed as a script, so that importing this
# module (e.g. to test a single check function) neither probes the
# hardware nor calls sys.exit().
if __name__ == '__main__':
    main()
File Metadata
Details
Attached
Mime Type
text/x-python
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
194308
Default Alt Text
check-raid.py (15 KB)
Attached To
Mode
T84050: Refactor RAID checks (check-raid)
Attached
Detach File
Event Timeline
Log In to Comment