Lazy File Manager
| Current Path : /lib64/nagios/plugins/ |
| PPPath//lib64/nagios/plugins |
| Current File : //lib64/nagios/plugins/check_a2_smart.py.all |
#!/usr/bin/python3
import subprocess
from shlex import split
import os
import glob
import re
import sys
import time
import socket
# Highest severity seen so far; raised with max() as checks run.
exit_code = 0  # Default OK

# Check if the hostname contains "vplatform" - BFENG-1670
# Such hosts are reported OK straight away, before any disk is inspected.
hostname = socket.gethostname()
if "vplatform" in hostname:
    print("Check_Smart_sda - this is vplatform server - OK")
    # sys.exit() instead of the exit() helper: the helper is injected by the
    # site module for interactive use and is absent when site is not loaded.
    sys.exit(0)
# Global variables

# Raw Wear_Level thresholds for the "860 PRO 1TB" model. The CRIT value is the
# divisor disk_check() uses to normalise that model's raw wear count; the WARN
# value is not referenced by the code visible in this file.
WEAR_WARN_860_PRO_1TB, WEAR_CRIT_860_PRO_1TB = 5500, 5750

# Single drive: alert warning / critical when remaining wear life is at or
# below these values.
WEAR_WARN, WEAR_CRIT = 5, 2

# RAID pair: alert warning / critical when BOTH drives of a pair are at or
# below these values.
PAIR_WEAR_WARN, PAIR_WEAR_CRIT = 10, 5

# Warn when a drive's 'rsvd' count is above this (not applied to Micron_5300).
SSDREMAP = 400

# Micron_5300 only: warn / critical when 'rsvd' falls below these values.
REMAP_WARN, REMAP_CRIT = 5, 2

# Wait this many seconds before retrying a failed smartctl:
FAIL_DELAY = 2

# One dict per wear-tracked drive; filled in by disk_check().
disk_list = []
def shell(cmd, outform="short", input=""):
    """Run *cmd* through ``bash -c`` and return what it wrote to stdout.

    *input* is encoded and supplied on the command's standard input.
    With ``outform="long"`` the complete decoded output is returned; with any
    other value only the text before the first newline is returned.  The
    command's exit status is not examined and stderr is not captured.
    """
    finished = subprocess.run(
        ['bash', '-c', cmd],
        input=input.encode(),
        stdout=subprocess.PIPE,
    )
    text = finished.stdout.decode('utf-8')
    if outform == "long":
        return text
    return text.split('\n')[0]
def get_smart_out(dev):
    """Return the output of ``<smart> -a <dev>``, retrying transient failures.

    The command is attempted up to five times.  An attempt is considered
    failed when its output contains one of the known failure messages below;
    each failed attempt is followed by a FAIL_DELAY-second pause.  The path to
    smartctl is read from the module-level ``smart`` variable.

    Returns the first output that contains none of the failure messages, or
    an empty string when all five attempts fail.  The empty string makes the
    caller raise its critical "unrecognized model" error, so a drive that
    cannot answer SMART queries at all still gets attention.
    """
    failure_markers = (
        "SMART Disabled",
        "A mandatory SMART command failed",
        "Read SMART Data failed",
    )
    for _attempt in range(5):
        report = str(shell(smart + ' -a ' + dev, outform="long"))
        if not any(marker in report for marker in failure_markers):
            # We have valid SMART output
            return report
        # wait, then try again
        time.sleep(FAIL_DELAY)
    # still no valid SMART output after 5 tries!
    return ""
# Sentinel stored as a disk's wear/rsvd/status when a SMART value was unreadable.
_CHECK_FAILED = 9000

# Model prefixes handled as spinning disks, which key on pending sectors and
# remap count.
_SPINNING_PREFIXES = ('WDC', 'TOSHIBA', 'ST1', 'ST3', 'ST4', 'ST5', 'ST8',
                      'ST9', 'ST2000', 'GB1000')


def _attr_text(attribute, column, smart_out):
    """Return whitespace column *column* of the first line matching *attribute*."""
    return str(shell("grep " + attribute + " | awk '{print $" + str(column) + "}'",
                     input=smart_out))


def _attr_int(attribute, column, smart_out):
    """Integer form of _attr_text(); raises ValueError when nothing numeric is found."""
    return int(_attr_text(attribute, column, smart_out))


def _sas_counter(label, column, smart_out):
    """Return the integer in *column* of the first line matching the word(s) *label*."""
    return int(shell('grep -w "' + label + '" | awk \'{ print $' + str(column) + ' }\'',
                     input=smart_out))


def _check_sas(device, smart_out):
    """Print the SAS defect/error verdict for *device* and return its severity (0 or 2)."""
    defectlist = _sas_counter("Elements in grown defect list:", 6, smart_out)
    read_error = _sas_counter("read:", 8, smart_out)
    write_error = _sas_counter("write:", 8, smart_out)
    verify_error = _sas_counter("verify:", 8, smart_out)
    if max(defectlist, read_error, write_error, verify_error) > 20:
        print("Check_Smart_" + device + " - CRITICAL - " + device
              + ", Elements in defect list=" + str(defectlist)
              + ", Uncorrected read errors=" + str(read_error)
              + ", Uncorrected write errors=" + str(write_error)
              + ", Uncorrected verify errors=" + str(verify_error))
        return 2
    print("Check_Smart_" + device + " - OK")
    return 0


def _check_sectors(device, smart_out, remap_attr, pending_attr):
    """Print the remap/pending verdict for *device* and return its severity (0 or 2)."""
    remap = _attr_int(remap_attr, 10, smart_out)
    pend = _attr_int(pending_attr, 10, smart_out)
    hours = _attr_int('Power_On_Hours', 10, smart_out)
    if remap > 50 or pend > 0:
        print("Check_Smart_" + device + " - CRITICAL - " + device + " SMART failure Hours="
              + str(hours) + " Remap=" + str(remap) + " Pending=" + str(pend))
        return 2
    print("Check_Smart_" + device + " - OK - " + device + " clean Hours=" + str(hours))
    return 0


def _inspect_disk(device, smart_out, model, protocol, mode):
    """Classify one drive; print its verdict or queue it in disk_list.

    Returns the severity (0 or 2) of whatever was printed here.  Drives that
    are graded on wear are appended to disk_list and graded later.  Raises
    ValueError when a required SMART value is missing or not numeric.
    """
    if not smart_out:
        # No SMART output at all: report it as critical in every mode instead
        # of failing later while parsing empty text.
        print("Check_Smart_" + device + " - ERROR: Unrecognized model: " + model)
        return 2

    severity = 0
    if protocol == 'SAS':
        severity = _check_sas(device, smart_out)

    if model.startswith(_SPINNING_PREFIXES):
        return max(severity, _check_sectors(
            device, smart_out, 'Reallocated_Sector_Ct', 'Current_Pending_Sector'))
    if model.startswith('KINGSTON'):
        return max(severity, _check_sectors(
            device, smart_out, 'Retired_Block_Count', 'Reported_Uncorrect'))

    if mode == 'nvme':
        # Normalize wear to mean life remaining, like is true for SATA
        wear = 100 - int(shell(
            "grep 'Percentage Used' | awk '{print $3}' | cut -d '%' -f1", input=smart_out))
        # No rsvd block count is read for NVMe, so use 0, which is always
        # below the remap threshold applied to SATA disks.
        rsvd = 0
    elif '860 PRO 1TB' in model:
        # Uses the raw value because the normalized smartctl value is too
        # conservative for this model.
        raw_wear = _attr_int('Wear_Level', 10, smart_out)
        rsvd = _attr_int('Used_Rsvd', 10, smart_out)
        # Normalize manually to percent of life remaining.  The ratio must be
        # scaled by 100; unscaled, the result never leaves the 99-100 range.
        wear = max(0.0, 100 - (raw_wear / WEAR_CRIT_860_PRO_1TB) * 100)
    elif (('SSD' in model and not model.startswith('INTEL SSD'))
          or model.startswith(('Kingston SKC', 'SAMSUNG MZ7'))):
        # Other SSD models that have acceptable SMART values.  A failed read
        # becomes the deliberately out-of-bounds sentinel.
        wear_text = _attr_text('Wear_Level', 4, smart_out)
        wear = int(wear_text) if wear_text.isdigit() else _CHECK_FAILED
        rsvd_text = _attr_text('Used_Rsvd', 10, smart_out)
        rsvd = int(rsvd_text) if rsvd_text.isdigit() else _CHECK_FAILED
    elif 'Micron_5300' in model or 'Micron_1100' in model:
        wear = _attr_int('Percent_Lifetime_Remain', 4, smart_out)
        rsvd = _attr_int('-e Unused_Rsvd_Blk_Cnt_Tot -e Reallocate_NAND_Blk_Cnt', 10,
                         smart_out)
    elif 'INTEL' in model:
        wear = _attr_int('Media_Wearout_Indicator', 4, smart_out)
        rsvd = _attr_int('Available_Reservd_Space', 10, smart_out)
    elif protocol == '':
        # if protocol isn't SAS and no models are matched above, error
        print("Check_Smart_" + device + " - ERROR: Unrecognized model: " + model)
        return max(severity, 2)
    else:
        return severity

    disk_list.append({'device': device, 'model': model, 'wear': wear, 'rsvd': rsvd})
    return severity


def _grade_disk(disk):
    """Set disk['status'] (0 good, 1 warn, 2+ crit, sentinel unknown) and disk['warn_type']."""
    status = 0
    # Remap check first; the wear check below may add to it.
    if 'Micron_5300' in disk['model']:
        if disk['rsvd'] < REMAP_CRIT:
            status += 2
        elif disk['rsvd'] < REMAP_WARN:
            status += 1
    elif disk['rsvd'] > SSDREMAP:
        status += 1
    # Wear is graded independently so a crit here can override a remap warn.
    # Wear counts down towards 0 (no predicted write life left), so <= is the
    # proper comparison.
    if disk['wear'] == _CHECK_FAILED:
        status = _CHECK_FAILED
    elif disk['wear'] <= WEAR_CRIT:
        status += 2
    elif disk['wear'] <= WEAR_WARN:
        status += 1
    disk['status'] = status
    if status == 0:
        disk['warn_type'] = "OK"
    elif status == _CHECK_FAILED:
        disk['warn_type'] = "UNKNOWN"
    elif status == 1:
        disk['warn_type'] = "WARNING"
    else:
        disk['warn_type'] = "CRITICAL"


def _wear_line(disk, label):
    """Build the status/perfdata line for *disk* ending in *label*."""
    remaining = str(round(disk['wear']))
    return ("Check_Smart_" + disk['device'] + " wear_life_remaining=" + remaining
            + ";" + str(WEAR_WARN) + ";" + str(WEAR_CRIT) + " remaining_life=" + remaining
            + "%" + " remaps=" + str(disk['rsvd']) + " " + disk['device'] + " " + label)


# self-contained function to check the disk
# mode = sata or nvme
def disk_check(smart, file_name_pattern, mode):
    """Check every device matching *file_name_pattern* and print one line per disk.

    Args:
        smart: path to smartctl.  It is not read inside this function;
            get_smart_out() takes the path from the module-level ``smart``.
        file_name_pattern: glob under /sys/block; the 4th path component of
            each match is used as the device name.
        mode: 'sata' or 'nvme'.  Any other value prints an error for the
            first matched device, records a critical and returns.

    Side effects: prints status lines, appends wear-tracked drives to the
    module-level disk_list and raises the module-level exit_code to the worst
    severity seen (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).

    Only the drives added by this call are graded and printed, so calling the
    function once per mode does not repeat the lines of the earlier call.
    The whole disk_list is still searched when looking up a RAID partner.
    """
    global exit_code
    first_new = len(disk_list)

    for sdx in glob.iglob(file_name_pattern):
        device = sdx.split("/")[3]
        smart_out = get_smart_out('/dev/' + device)
        if mode == 'sata':
            model = str(shell('grep "Device Model" | cut -c 19-', input=smart_out))
            protocol = str(shell('grep -w "Transport protocol:" | awk \'{ print $3 }\'',
                                 input=smart_out))
        elif mode == 'nvme':
            # NOTE(review): inside double quotes bash expands $4 before awk
            # runs, so this most likely yields the whole "Model Number" line
            # rather than one field.  Left unchanged because the model prefix
            # checks would behave differently with a bare name - confirm
            # before fixing.
            model = str(shell('grep "Model Number" | awk "{print $4}"', input=smart_out))
            protocol = 'nvme'
        else:
            print("Check_Smart_ " + device + " - ERROR: Unrecognized device mode: " + mode + ".")
            exit_code = max(exit_code, 2)
            return

        try:
            severity = _inspect_disk(device, smart_out, model, protocol, mode)
        except ValueError:
            # A SMART value was missing or not numeric.  Report this disk as
            # UNKNOWN and carry on with the others rather than abort the run.
            print("Check_Smart_" + device + " - UNKNOWN - " + device
                  + " unable to parse SMART attributes")
            severity = 3
        exit_code = max(exit_code, severity)
    # end of for looping over the disks

    # Fetch RAID info from mdadm about these devices and integrate with the smartctl data
    populate_raid_info(disk_list)

    new_disks = disk_list[first_new:]

    # Mark each new disk good or bad based on thresholds
    for disk in new_disks:
        _grade_disk(disk)

    # Now that health data is populated, report each disk, taking its RAID
    # pair partner into account.
    for disk in new_disks:
        if disk['status'] == _CHECK_FAILED:
            # something in check went wrong
            output = "Check_Smart_" + disk['device'] + " wear_life_remaining=?;?;? remaining_life=?%" + \
                " remaps=? " + disk['device'] + " UNKNOWN"
            exit_code = max(exit_code, 3)
        else:
            # warn_type is OK / WARNING / CRITICAL here; any status of 2 or
            # more counts as critical.
            output = _wear_line(disk, disk['warn_type'])
            exit_code = max(exit_code, min(disk['status'], 2))

        # now check status of pair partner
        part = find_pair(disk, disk_list)
        # find_pair() gives a falsy result when the disk has no RAID data or
        # no partner is listed; such disks are judged on their own.
        if part:
            partner_wear = part['wear']
            if disk['wear'] <= PAIR_WEAR_CRIT and partner_wear <= PAIR_WEAR_CRIT:
                # crit even if drives would be individually good or warn.
                output = _wear_line(disk, "CRITICAL")
                exit_code = max(exit_code, 2)
            elif (disk['status'] < 2 and disk['wear'] <= PAIR_WEAR_WARN
                  and partner_wear <= PAIR_WEAR_WARN):
                # warn even if drives would be individually good (but don't downgrade from crit).
                output = _wear_line(disk, "WARNING")
                exit_code = max(exit_code, 1)
        print(output)
# Fetch the list of md arrays from the system and add RAID details to the device dictionaries.
# Uses the first array line printed by "mdadm --detail --scan"; its RAID level is not checked.
# Area for future improvement: check all arrays instead of just the first, for sanity
# Also, it relies on adjacency to determine set info. In a 4x R10 there are two set-As
# and two set-Bs and it presumes that near=2 is the setting for deciding which to check.
def populate_raid_info(devices):
    """Add 'RaidDevice' and 'set' keys to the entries of *devices* found in the md array.

    Args:
        devices: list of dicts, each with a 'device' key holding a block
            device name.  Entries are updated in place.

    shell() is called with its default output form, which returns only the
    first line, so only the first array from the scan is examined.  Entries
    whose name does not appear in the array detail, or whose RaidDevice column
    is not a number, are left without the two keys.
    """
    arrays = shell("mdadm --detail --scan")
    for array in arrays.splitlines():
        fields = array.split(' ')
        if len(fields) < 2:
            # Not the expected "ARRAY <device> ..." layout; nothing to query.
            continue
        device = fields[1]
        # Query the array once and reuse the text for every disk.
        detail = shell("mdadm --detail " + device, outform="long")
        # Fetch detailed set information
        for dev in devices:
            match = "grep " + dev['device']
            raid_device = shell(match + " | awk '{print $4}'", input=detail)
            # Skip disks that are absent from the array or whose RaidDevice
            # column holds a non-number; int() would fail on those.
            if not raid_device.isdigit():
                continue
            dev['RaidDevice'] = int(raid_device)
            dev['set'] = shell(match + " | awk '{print $7}'", input=detail)
# Finds the R10 pair in a set
# Presumes near=2
def find_pair(disk, devices):
    """Return the entry of *devices* that is the RAID partner of *disk*.

    The partner is the neighbouring RaidDevice number: the next one up for an
    even number, the next one down for an odd number.  The lookup is done by
    fetch_disk_by_id(), whose result is returned unchanged.  Returns None when
    a KeyError is raised, e.g. because *disk* has no 'set' or 'RaidDevice' key.
    """
    try:
        # Both keys are looked up first so a disk without RAID data raises
        # KeyError before any search is made.
        disk['set']
        own_slot = disk['RaidDevice']
        if own_slot % 2 == 0:
            partner_slot = own_slot + 1
        else:
            partner_slot = own_slot - 1
        return fetch_disk_by_id(partner_slot, devices)
    except KeyError:
        return None
def fetch_disk_by_id(id, devices):
    """Return the first entry of *devices* whose 'RaidDevice' equals *id*.

    Entries without a 'RaidDevice' key are skipped, so a disk that is not in
    the array cannot hide a partner listed after it.  Returns None when no
    entry matches.
    """
    for d in devices:
        if 'RaidDevice' in d and d['RaidDevice'] == id:
            return d
    return None
## MAIN CODE
# Let's skip mvps: exit without output when the salt minion role is 'mvps'.
grains_role = shell("grep ^[[:space:]].role: /etc/salt/minion | awk '{print $2}'")
if grains_role == 'mvps':
    sys.exit()

# determine which disk type the machine uses
sdx = os.path.isfile("/sys/block/sda/size")
# NVMe counts as present when any of nvme0n1 .. nvme5n1 exists.
nvme_x = any(os.path.isfile("/sys/block/nvme" + str(x) + "n1/size") for x in range(6))

# Fail silently and early out of devices that lack both. These would be VMs with
# xvda and such, which ought to neither have SMARTmontools nor physical disks to check
if not sdx and not nvme_x:
    sys.exit()

# check for smartmontools; get_smart_out() reads this module-level name.
smart = shell('which smartctl')
if not smart:
    # Print the error only: 'smart' is empty here, and printing it first
    # would put a blank line ahead of the message.
    print("Check_Smart_sda - ERROR: Unable to detect smartmontools. Is it installed?")
    sys.exit(2)

# execute appropriate check(s): SATA first, then NVMe, for whichever is present
if sdx:
    disk_check(smart, '/sys/block/sd?', 'sata')
if nvme_x:
    disk_check(smart, '/sys/block/nvme?n1/nvme?n?p1', 'nvme')

# Exit with the highest severity discovered (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN)
sys.exit(exit_code)
Da3s File Manager Version 1.0, Coded By Da3s HaCkEr
Email: R0@hotmail.com
