Page MenuHomePhabricator
Authored By
fgiunchedi
Jul 21 2015, 3:57 PM
Size
15 KB
Referenced Files
None
Subscribers
None

check-raid.py

#!/usr/bin/python
import os.path
import re
import subprocess
import sys
def main():
    """Detect RAID controllers, run the matching checks and report.

    Prints one Nagios-style line ("STATUS: report, report, ...") to
    stdout and exits the process with the most dire status code that
    any check produced.
    """
    # Utility name (as reported by getLinuxUtility) -> check function.
    # Each check function returns its most dire status code and a text
    # string to add to the final report.
    checks = {
        'arcconf': checkAdaptec,
        'tw_cli': check3ware,
        'MegaRAID_SAS': checkMegaSAS,
        'FusionMPT': checkFusionMPT,
        'HPSA': checkHPSA,
        'mdadm': checkLinuxRAID,
    }
    codes = []
    messages = []
    for utility in getLinuxUtility():
        check = checks.get(utility)
        if check is None:
            # detected, but there is no check implemented for it
            code, message = 0, '%s found but unsupported' % utility
        else:
            code, message = check()
        messages.append(message)
        codes.append(code)
    if not codes:
        messages.append('no RAID found')
        codes.append(0)
    # Translate the Nagios status code into plain English and join the
    # per-controller reports into one line on stdout.
    labels = {
        0: 'OK',
        1: 'WARNING',
        2: 'CRITICAL',
        3: 'UNKNOWN',
    }
    exit_status = mostDireStatus(codes)
    print('%s: %s' % (labels[exit_status], ', '.join(messages)))
    sys.exit(exit_status)
def mostDireStatus(status_codes=()):
    """Pick the most interesting status code from a collection.

    Precedence is critical (2) > warning (1) > unknown (3) > ok (0).

    :param status_codes: list/tuple/set of integer status codes; may be
        empty
    :returns: integer from 0-3; 0 when none of 2, 1 or 3 is present
    """
    # Checked in order of decreasing severity; note that UNKNOWN (3)
    # ranks below WARNING (1) on purpose.
    for code in (2, 1, 3):
        if code in status_codes:
            return code
    return 0
def shellCall(command=()):
    """Run an external command and capture its output.

    The command is executed directly (no shell is involved).

    :param command: program and arguments as a sequence of strings
    :returns: (stdoutdata, stderrdata) tuple of text strings
    :raises Exception: if the command cannot be started, or if it exits
        with a nonzero return code.  In the latter case the message is
        stderr (or stdout, or the return code if both are empty) with
        whitespace collapsed and truncated to 200 characters.
    """
    try:
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # return text rather than bytes so that callers can apply
            # str regexes to the output on Python 3 as well
            universal_newlines=True,
        )
        # collect and cache stdout and stderr
        stdoutdata, stderrdata = proc.communicate()
    except Exception as err:
        # Callers only expect a plain Exception carrying a message.
        # KeyboardInterrupt/SystemExit are deliberately not converted.
        raise Exception(str(err))
    # report any nonzero exit status with informative
    # stderr or stdout info
    if proc.returncode:
        msg = (
            stderrdata or stdoutdata or
            'return code = %s' % proc.returncode
        )
        msg = re.sub(r'\s+', ' ', msg).strip()
        if len(msg) > 200:
            msg = msg[:200] + '...'
        raise Exception(msg)
    return (stdoutdata, stderrdata)
def _firstKnownEntry(path, regex, known):
    """Return the utility mapped to the first recognised name in a file.

    :param path: file to scan line by line
    :param regex: compiled pattern whose group 1 is the name to look up
    :param known: dict mapping recognised names to utility identifiers
    :returns: the mapped identifier, or None if no line is recognised
    """
    # the with-block closes the file even if reading fails midway
    with open(path, 'r') as f:
        for line in f:
            m = regex.match(line)
            if m is not None and m.group(1) in known:
                return known[m.group(1)]
    return None


def getLinuxUtility():
    """Detect which RAID utilities apply to this host.

    Scans /proc/devices and /proc/modules for known driver names (only
    the first recognised entry of each file is used) and asks
    getLinuxRAIDDevices() whether any software RAID array exists.

    :returns: list of utility identifiers such as 'arcconf', 'HPSA' or
        'mdadm'; empty if nothing was detected
    """
    utility = []
    # devices that implement block devices
    found = _firstKnownEntry(
        '/proc/devices',
        re.compile(r'^\s*\d+\s+(\w+)'),
        {
            'aac': 'arcconf',
            'twe': 'tw_cli',
            'megadev': 'megarc',
            'megaraid_sas_ioctl': 'MegaRAID_SAS',
        },
    )
    if found is not None:
        utility.append(found)
    # devices that do not implement block devices
    found = _firstKnownEntry(
        '/proc/modules',
        re.compile(r'^(\w+)\s\d+'),
        {
            'hpsa': 'HPSA',
            'mptbase': 'FusionMPT',
        },
    )
    if found is not None:
        utility.append(found)
    # check for software RAID
    if getLinuxRAIDDevices():
        utility.append('mdadm')
    return utility
def checkFusionMPT():
    """Check Fusion MPT arrays by parsing mpt-status output.

    Runs ``/usr/sbin/mpt-status --autoload --status_only`` and inspects
    every ``log_id`` (logical drive) and ``phys_id`` (physical drive)
    line.  A logical drive that is not OPTIMAL or a physical drive that
    is not ONLINE counts as CRITICAL (2); a failure to run the command
    counts as UNKNOWN (3).

    NOTE(review): shellCall raises on any nonzero exit status; if
    mpt-status signals a degraded array through its exit code, that
    case is reported as UNKNOWN rather than CRITICAL -- TODO confirm.

    :returns: (status code, report text) tuple
    """
    report = []
    status = []
    logicalDriveRx = re.compile(r'log_id (\d+) (\w+)')
    physicalDriveRx = re.compile(r'phys_id (\d+) (\w+)')
    try:
        stdoutdata, stderrdata = shellCall(
            ['/usr/sbin/mpt-status', '--autoload', '--status_only'],
        )
    except Exception:
        report.append(str(sys.exc_info()[1]))
        status.append(3)
    else:
        for line in stdoutdata.splitlines():
            logical = logicalDriveRx.match(line)
            if logical is not None:
                report.append(
                    'log_%s: %s' % (logical.group(1), logical.group(2))
                )
                status.append(0 if logical.group(2) == 'OPTIMAL' else 2)
                continue
            physical = physicalDriveRx.match(line)
            if physical is not None:
                report.append(
                    'phy_%s: %s' % (physical.group(1), physical.group(2))
                )
                status.append(0 if physical.group(2) == 'ONLINE' else 2)
    return (mostDireStatus(status), 'FusionMPT %s' % '; '.join(report))
def checkHPSA():
    """Check HP Smart Array controllers and drives via hpssacli.

    Two hpssacli invocations are parsed: ``ctrl all show detail`` for
    the per-controller status and ``ctrl all show config`` for the
    logical and physical drives.

    Status mapping: a controller, logical drive or physical drive that
    is not OK is CRITICAL (2), except that a logical drive reporting
    "N% complete" or a physical drive in state 'Rebuilding' is only a
    WARNING (1).  A failing hpssacli call is UNKNOWN (3), as is a
    missing hpssacli executable.

    :returns: (status code, report text) tuple
    """
    report = []
    status = [0]
    # controller name -> [controller status, drive report, ...]
    data = {}
    logicalDriveRx = re.compile(
        r'\s+logicaldrive\s+(\d+)\s+\((.*),\s*([^,]+)\)$'
    )
    physicalDriveRx = re.compile(
        r'\s+physicaldrive\s+(\S+)\s+\([^,]+,[^,]+,[^,]+,\s*([^,)]+)'
    )
    controllerRx = re.compile(r'Smart Array (\w+) in Slot (\d+)')
    controllerStatusRx = re.compile(r'^\s+Controller Status:\s+(\w+)')
    rebuildRx = re.compile(r'\d+% complete$')
    executable = None
    for candidate in ('/usr/sbin/hpssacli', '/opt/usr/sbin/hpssacli'):
        if os.path.exists(candidate):
            executable = candidate
            break
    if executable is None:
        return (3, 'HPSA no hpssacli executable found')
    # get controller status
    try:
        stdoutdata, stderrdata = shellCall(
            [executable, 'ctrl', 'all', 'show', 'detail']
        )
    except Exception:
        status.append(3)
        report.append('hpssacli error: %s' % str(sys.exc_info()[1]))
    else:
        controller = 'unknown'
        for line in stdoutdata.splitlines():
            c = controllerRx.match(line)
            if c is not None:
                controller = '%s/slot%s' % (c.group(1), c.group(2))
                continue
            d = controllerStatusRx.match(line)
            if d is not None:
                data[controller] = [d.group(1)]
                if d.group(1) != 'OK':
                    status.append(2)
    # get logical and physical drive status
    try:
        stdoutdata, stderrdata = shellCall(
            [executable, 'ctrl', 'all', 'show', 'config']
        )
    except Exception:
        status.append(3)
        report.append('hpssacli error: %s' % str(sys.exc_info()[1]))
    else:
        controller = 'unknown'
        for line in stdoutdata.splitlines():
            c = controllerRx.match(line)
            if c is not None:
                controller = '%s/slot%s' % (c.group(1), c.group(2))
                # '???' stands in for a controller status that the
                # first pass did not provide
                data.setdefault(controller, ['???'])
                continue
            ld = logicalDriveRx.match(line)
            if ld is not None:
                ldinfo = ld.group(2).replace(' ', '')
                # setdefault: a drive line may appear before any
                # controller header was recognised; do not KeyError
                data.setdefault(controller, ['???']).append(
                    'log_%s: %s %s' % (ld.group(1), ldinfo, ld.group(3))
                )
                if rebuildRx.match(ld.group(3)):
                    status.append(1)
                elif ld.group(3) != 'OK':
                    status.append(2)
                continue
            pd = physicalDriveRx.match(line)
            # only physical drives that are not OK are reported
            if pd is not None and pd.group(2) != 'OK':
                data.setdefault(controller, ['???']).append(
                    'phy_%s: %s' % (pd.group(1), pd.group(2))
                )
                if pd.group(2) == 'Rebuilding':
                    status.append(1)
                else:
                    status.append(2)
    for controller in data:
        report.append(
            '[%s: %s]' % (controller, ', '.join(data[controller]))
        )
    return (mostDireStatus(status), 'HPSA %s' % '; '.join(report))
def checkAdaptec():
    """Check an Adaptec controller by parsing ``arcconf getconfig 1``.

    Looks at the "Defunct disk drive count" line and the
    "Logical devices/Failed/Degraded" line; the first nonzero count
    found is reported as CRITICAL (2) and parsing stops there.  A
    failing arcconf call is UNKNOWN (3).

    NOTE(review): the original author flagged this check as untested
    ("probably broken, need to find a host to test").

    :returns: (status code, report text) tuple
    """
    report = []
    status = []
    defunctRx = re.compile(
        r'^\s*Defunct disk drive count\s*:\s*(\d+)'
    )
    logicalRx = re.compile(
        r'^\s*Logical devices/Failed/Degraded\s*:\s*(\d+)/(\d+)/(\d+)'
    )
    # change directory so that the log file goes to the right place
    oldDir = os.getcwd()
    os.chdir('/var/log')
    try:
        stdoutdata, stderrdata = shellCall(
            ['/usr/bin/arcconf', 'getconfig', '1'],
        )
    except Exception:
        status.append(3)
        report.append(str(sys.exc_info()[1]))
    else:
        for line in stdoutdata.splitlines():
            m = defunctRx.match(line)
            # compare numerically; identity tests against '0' are not
            # a reliable way to compare strings
            if m is not None and int(m.group(1)) != 0:
                report.append('Defunct disk drive count: ' + m.group(1))
                status.append(2)
                break
            m = logicalRx.match(line)
            if m is None:
                continue
            failed = int(m.group(2))
            degraded = int(m.group(3))
            if failed and degraded:
                report.append(
                    'logical devices: %s failed and %s degraded' %
                    (m.group(2), m.group(3))
                )
            elif failed:
                report.append('logical devices: %s failed' % m.group(2))
            elif degraded:
                report.append('logical devices: %s degraded' % m.group(3))
            else:
                continue
            status.append(2)
            break
    finally:
        # always restore the working directory
        os.chdir(oldDir)
    return (mostDireStatus(status), 'Adaptec %s' % '; '.join(report))
def check3ware():
    """Check 3ware controllers by parsing tw_cli output.

    ``tw_cli show`` is parsed for controller names (c0, c1, ...), then
    ``tw_cli /cN show`` is parsed for each controller's drive lines
    (p0, p1, ...).

    NOTE(review): the original author flagged this check as untested
    ("probably broken, need to find a host to test").

    :returns: (status code, report text) tuple: 2 when any drive is not
        OK, 1 when tw_cli fails or no drive line could be parsed, 0
        otherwise
    """
    # Get the list of controllers
    try:
        stdoutdata, stderrdata = shellCall(
            ['/usr/bin/tw_cli', 'show'],
        )
    except Exception:
        return (1, '3ware tw_cli error %s' % str(sys.exc_info()[1]))
    controllerRx = re.compile(r'^(c\d+)')
    controllers = []
    for line in stdoutdata.splitlines():
        m = controllerRx.match(line)
        if m is not None:
            controllers.append('/' + m.group(1))
    # Check each controller
    driveRx = re.compile(r'^(p\d+)\s+([\w-]+)')
    failedDrives = []
    numDrives = 0
    for controller in controllers:
        try:
            stdoutdata, stderrdata = shellCall(
                ['/usr/bin/tw_cli', controller, 'show'],
            )
        except Exception:
            return (1, '3ware tw_cli error %s' % str(sys.exc_info()[1]))
        for line in stdoutdata.splitlines():
            m = driveRx.match(line)
            if m is None:
                continue
            numDrives += 1
            if m.group(2) != 'OK':
                failedDrives.append(controller + '/' + m.group(1))
    if failedDrives:
        return (
            2, '%d failed drive(s): %s' %
            (len(failedDrives), ', '.join(failedDrives))
        )
    if numDrives == 0:
        return (1, 'no physical drives found, tw_cli parse error?')
    return (0, '%d drives checked' % numDrives)
def checkMegaSAS():
    """Check MegaRAID SAS logical drives by parsing MegaCli output.

    Runs ``<megacli> -LDInfo -LALL -aALL -NoLog`` and records the state
    and drive count of every virtual drive.  A virtual drive whose
    state is not 'Optimal' is CRITICAL (2); a missing executable, a
    failing call, or output without any adapter is UNKNOWN (3).

    :returns: (status code, report text) tuple
    """
    report = []
    status = [0]
    # change directory so that the log file goes to the right place
    oldDir = os.getcwd()
    os.chdir('/var/log')
    try:
        try:
            executable = None
            for candidate in ('/opt/MegaRAID/MegaCli/MegaCli64',
                              '/usr/sbin/megacli'):
                if os.path.exists(candidate):
                    executable = candidate
                    break
            if executable is None:
                raise Exception('no megacli executable found')
            stdoutdata, stderrdata = shellCall(
                [executable, '-LDInfo', '-LALL', '-aALL', '-NoLog'],
            )
        except Exception:
            report.append(
                'error executing megacli: %s' % str(sys.exc_info()[1])
            )
            status.append(3)
        else:
            adapterRx = re.compile(r'^Adapter\s+(\d+)\s*:?')
            logicalRx = re.compile(r'^Virtual Drive:\s+(\d+)\s+')
            stateRx = re.compile(r'^State\s*:\s*([^\n]*)')
            driveRx = re.compile(
                r'^Number Of Drives\s*(per span)?\s*:\s+(\d+)'
            )
            # both dicts are keyed by 'a<adapter>/v<virtual drive>'
            state = {}
            physicalCount = {}
            adCount = 0
            adapter = None
            logical = None
            for line in stdoutdata.splitlines():
                a = adapterRx.match(line)
                if a is not None:
                    adapter = a.group(1)
                    adCount += 1
                    continue
                v = logicalRx.match(line)
                if v is not None:
                    logical = 'a%s/v%s' % (adapter, v.group(1))
                    continue
                s = stateRx.match(line)
                if s is not None:
                    state[logical] = s.group(1).lower()
                    status.append(0 if s.group(1) == 'Optimal' else 2)
                    continue
                d = driveRx.match(line)
                if d is not None:
                    physicalCount[logical] = int(d.group(2))
            if adCount == 0:
                report.append('no adapters found!')
                # record UNKNOWN in the status list
                status.append(3)
            else:
                report.append(
                    '%s logical, %s physical' %
                    (len(state), sum(physicalCount.values()))
                )
                for logical in state:
                    if state[logical] != 'optimal':
                        report.append(
                            '%s (%s disk array) %s' % (
                                logical,
                                # the drive count line may be missing
                                physicalCount.get(logical, 'unknown'),
                                state[logical],
                            )
                        )
    finally:
        # always restore the working directory
        os.chdir(oldDir)
    return (mostDireStatus(status), 'MegaSAS %s' % '; '.join(report))
def getLinuxRAIDDevices():
    """Query mdadm for configured RAID arrays.

    Parses the ``ARRAY <device> ...`` lines printed by
    ``/sbin/mdadm --detail --scan``.

    :returns: list of array device names; empty when mdadm cannot be
        run, exits nonzero, or reports no arrays
    """
    try:
        stdoutdata, stderrdata = shellCall(
            ['/sbin/mdadm', '--detail', '--scan']
        )
    except Exception:
        # best effort: a missing or failing mdadm means "no software
        # RAID" rather than an error
        return []
    arrayRx = re.compile(r'^ARRAY\s+([^ ]*) ')
    matches = (arrayRx.match(line) for line in stdoutdata.splitlines())
    return [m.group(1) for m in matches if m is not None]
def checkLinuxRAID():
    """Check Linux software RAID arrays by parsing ``mdadm --detail``.

    For every array returned by getLinuxRAIDDevices() the Active,
    Working, Failed and Spare device counts are reported.  An array
    with one or more failed devices is CRITICAL (2); a failing mdadm
    call is UNKNOWN (3).

    :returns: (status code, report text) tuple
    """
    report = []
    status = []
    # device name -> {'Active': n, 'Working': n, 'Failed': n, 'Spare': n}
    data = {}
    devices = getLinuxRAIDDevices()
    args = ['/sbin/mdadm', '--detail']
    args.extend(devices)
    try:
        stdoutdata, stderrdata = shellCall(args)
    except Exception:
        return (
            3, 'LinuxRAID: error executing mdadm: %s' %
            str(sys.exc_info()[1])
        )
    deviceRx = re.compile(r'^(/[^ ]*):$')
    statRx = re.compile(
        r'^ *(Active|Working|Failed|Spare) Devices *: *(\d+)'
    )
    logical = None
    for line in stdoutdata.splitlines():
        d = deviceRx.match(line)
        if d is not None:
            logical = d.group(1)
            data[logical] = {}
            continue
        s = statRx.match(line)
        # ignore counters that appear before any device header
        if s is not None and logical is not None:
            data[logical][s.group(1)] = int(s.group(2))
    for device in data:
        counts = data[device]
        # '?' marks a counter that mdadm did not print for this array
        report.append(
            '%s: act=%s, wk=%s, fail=%s, sp=%s' %
            (device, counts.get('Active', '?'), counts.get('Working', '?'),
             counts.get('Failed', '?'), counts.get('Spare', '?'))
        )
        if counts.get('Failed', 0) > 0:
            status.append(2)
        else:
            status.append(0)
    return (mostDireStatus(status), 'LinuxRAID %s' % '; '.join(report))
# Run the check only when executed as a script, so that importing this
# module does not run the checks and call sys.exit().
if __name__ == '__main__':
    main()

File Metadata

Mime Type
text/x-python
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
194308
Default Alt Text
check-raid.py (15 KB)

Event Timeline