Monitor hard disk SMART status

I have been fighting with smartd for years, but I never really managed to configure it the way I want.
While I am certainly not Backblaze, I still have quite a few hard disks that I would like to monitor and be able to replace before they actually die.
I hacked up a small Python script that queries some SMART attributes and sends me an email in case something funky is going on; to use it, just put it in /etc/cron.daily and let cronie run it.
To work correctly the script also needs smartctl and sendmail.

#!/usr/bin/env python2.7

import subprocess
import re
import socket
import os


# Name of the machine running the script; used as the local part of the
# sender address below.
hostname = socket.gethostname()
########################################################################
# User settings: mail domain of the sender, list of recipients handed to
# sendmail, and the block devices (names relative to /dev/) to inspect.
DOMAIN = "domain.tld"
TO = ['receiver']
HDD = ["sda", "sdb", "sdc", "sdd", "sde"]
########################################################################
# Sender identity, "<hostname>@<DOMAIN>".
FROM = "@".join((hostname, DOMAIN))
# One counter per entry of HDD (same index); query_drive_state() bumps the
# counter of a drive for every monitored attribute found above zero.
alert = len(HDD) * [0]
print(alert)


# Monitored SMART attributes, checked in this order for every output line:
# (attribute name as printed by smartctl,
#  label used in the report,
#  True when a raw value above zero marks the drive as failing).
# Total_LBAs_Written / Total_LBAs_Read are deliberately left out: they do
# not work with Plextor drives.
_SMART_ATTRIBUTES = (
    ("Reallocated_Sector_Ct", "Reallocated sector count*: ", True),
    ("Wear_Leveling_Count", "Wear leveling count: ", False),
    ("Reallocated_Event_Count", "Reallocated event count*: ", True),
    ("Current_Pending_Sector", "Current pending sector*: ", True),
    ("Offline_Uncorrectable", "Offline uncorrectable: ", True),
    ("Media_Wearout_Indicator", "Media wearout indicator: ", False),
)


def _parse_smart_attributes(output):
    """Turn the text printed by ``smartctl -a`` into report lines.

    Returns a ``(report_lines, failing_count)`` tuple where
    ``failing_count`` is the number of failure-indicating attributes whose
    raw value is greater than zero.
    """
    report_lines = []
    failing_count = 0

    for line in output.splitlines():
        attribute = next(
            (entry for entry in _SMART_ATTRIBUTES if entry[0] in line), None)
        if attribute is None:
            continue
        _, label, marks_failure = attribute

        # An attribute row has ten columns with RAW_VALUE last.  Splitting
        # on the columns (instead of on " - ") keeps working when the
        # WHEN_FAILED column holds e.g. FAILING_NOW rather than "-".
        columns = line.split(None, 9)
        if len(columns) < 10:
            # Mentions the attribute but is not a table row.
            continue
        raw_value = columns[9].strip()

        try:
            count = int(raw_value)
        except ValueError:
            # Some drives report a raw value that is not a plain integer;
            # show it verbatim instead of aborting the whole run.
            report_lines.append(label + raw_value)
            continue

        if marks_failure and count > 0:
            failing_count += 1
        report_lines.append(label + str(count))

    return report_lines, failing_count


def query_drive_state(drive_id, i):
    """Query one drive with smartctl and summarise the monitored attributes.

    Runs ``smartctl -a /dev/<drive_id>``, extracts the attributes listed in
    ``_SMART_ATTRIBUTES`` and adds the number of failure-indicating
    attributes with a raw value above zero to the module-level ``alert[i]``.

    Args:
        drive_id: device name relative to ``/dev/`` (for example ``"sda"``).
        i: index of the drive's counter in the module-level ``alert`` list.

    Returns:
        A list of ``"<label><value>"`` strings, one per monitored attribute
        found in the smartctl output.

    Raises:
        OSError: if the ``smartctl`` executable cannot be started.
    """
    p = subprocess.Popen(["smartctl", "-a", "/dev/" + drive_id],
                         stdout=subprocess.PIPE)
    output = p.communicate()[0]

    # The pipe yields bytes on Python 3 and a native str on Python 2;
    # normalise once so the parsing works on text under both.
    if not isinstance(output, str):
        output = output.decode("utf-8", "replace")

    drive_results, failing_count = _parse_smart_attributes(output)
    alert[i] += failing_count

    print('drive_results', drive_results)
    print('alert', alert)
    return drive_results


def send_alert_sendmail(TEXT, failed_drives):
    """Mail the SMART report to every address in ``TO`` through sendmail.

    The message consists of a single ``Subject:`` header, built from
    ``FROM`` and the failed drives, followed by a blank line and ``TEXT``.

    Args:
        TEXT: body of the mail.
        failed_drives: string naming the failing drives; when empty the
            subject reads "Failed drives: None".

    Raises:
        OSError: if the ``sendmail`` executable cannot be started.
    """
    subject = ("Subject: " + FROM + " - Failed drives: " +
               (failed_drives or "None"))
    # Trailing newline matches what `echo` used to append to the message.
    message = subject + "\n\n" + TEXT + "\n"

    # The pipe wants bytes on Python 3; a Python 2 str already is bytes.
    if not isinstance(message, bytes):
        message = message.encode("utf-8")

    # Feed the message on stdin with an argument list, so no shell ever
    # parses it: quotes, "$" or backticks in the report can neither break
    # the command nor be executed.
    p = subprocess.Popen(["sendmail", "-v"] + list(TO),
                         stdin=subprocess.PIPE)
    p.communicate(message)


def main():
    """Check every drive in ``HDD`` and mail a report if any is failing.

    Each drive is queried with query_drive_state(); the per-drive results
    are merged into one text report.  The report is sent with
    send_alert_sendmail() only when at least one drive has a non-zero
    counter in ``alert``.
    """
    text = "\n######################################################"
    text += "\n### * Pre-fail attributes, replace the disk if > 0 ###"
    text += "\n######################################################\n"

    # Gather HDD health information
    aggregated_results = [query_drive_state(drive, index)
                          for index, drive in enumerate(HDD)]

    # Merge HDD health status
    for drive, drive_results in zip(HDD, aggregated_results):
        text += "\ndev/" + drive + "\n"
        text += "".join(element + "\n" for element in drive_results)

    # Prepare a list of failing HDD; the names are comma separated so the
    # mail subject stays readable when more than one drive is failing.
    print(alert)
    failed_drives = ", ".join(
        drive for drive, count in zip(HDD, alert) if count >= 1)

    # Send alert email
    if failed_drives:
        send_alert_sendmail(text, failed_drives)


if __name__ == '__main__':
    main()

uwot.eu

Monitor hard disk SMART status :: Python