#!/bin/bash
# ==========================================================
# ProxMenux - SMART Disk Health & Test Tool
# ==========================================================
# Author : MacRimi
# Copyright : (c) 2024 MacRimi
# License : (GPL-3.0) (https://github.com/MacRimi/ProxMenux/blob/main/LICENSE)
# Version : 1.0
# Last Updated: 12/04/2026
# ==========================================================
# Description:
# SMART health check and disk testing tool for Proxmox VE.
# Supports SATA/SAS disks (smartmontools) and NVMe drives (nvme-cli).
# Exports results as JSON to /usr/local/share/proxmenux/smart/
# for ProxMenux Monitor integration.
# Long tests run on the drive hardware and persist after terminal close.
# ==========================================================
# Configuration ============================================
# Install locations and shared helper library.
LOCAL_SCRIPTS="/usr/local/share/proxmenux/scripts"
BASE_DIR="/usr/local/share/proxmenux"
UTILS_FILE="$BASE_DIR/utils.sh"
VENV_PATH="/opt/googletrans-env"
BACKTITLE="ProxMenux"
# Root directory for exported SMART JSON snapshots (one sub-directory per disk).
SMART_DIR="$BASE_DIR/smart"

# dialog geometry (height / width / list height) shared by every screen below.
UI_MENU_H=22
UI_MENU_W=84
UI_MENU_LIST_H=12
UI_SHORT_MENU_H=16
UI_SHORT_MENU_W=72
UI_SHORT_MENU_LIST_H=6
UI_MSG_H=10
UI_MSG_W=72
UI_RESULT_H=14
UI_RESULT_W=86

# shellcheck source=/dev/null
[[ -f "$UTILS_FILE" ]] && source "$UTILS_FILE"
load_language
initialize_cache

# Prefer the install helpers that sit next to this script (one directory up,
# under global/); fall back to the system-wide copy in LOCAL_SCRIPTS.
SCRIPT_DIR_SMART="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LOCAL_SCRIPTS_LOCAL="$(cd "$SCRIPT_DIR_SMART/.." && pwd)"
if [[ -f "$LOCAL_SCRIPTS_LOCAL/global/utils-install-functions.sh" ]]; then
    # shellcheck source=/dev/null
    source "$LOCAL_SCRIPTS_LOCAL/global/utils-install-functions.sh"
elif [[ -f "$LOCAL_SCRIPTS/global/utils-install-functions.sh" ]]; then
    # shellcheck source=/dev/null
    source "$LOCAL_SCRIPTS/global/utils-install-functions.sh"
fi
# Configuration ============================================
# ==========================================================
# Helpers
# ==========================================================
# Succeeds (status 0) when the device path in $1 contains "nvme".
# Arguments: $1 - device path, e.g. /dev/nvme0n1
_smart_is_nvme() {
    [[ "$1" == *nvme* ]]
}
# Print a one-line human label for a disk: "<size> — <model>".
# Arguments: $1 - device path
# Outputs:   the label on stdout, size left-aligned in an 8-character column,
#            no trailing newline. Falls back to "?" / "Unknown" when lsblk
#            reports nothing for the respective field.
_smart_disk_label() {
    local disk="$1"
    local model size

    # xargs with no command trims and collapses the whitespace lsblk pads with.
    model=$(lsblk -dn -o MODEL "$disk" 2>/dev/null | xargs)
    size=$(lsblk -dn -o SIZE "$disk" 2>/dev/null | xargs)

    [[ -z "$model" ]] && model="Unknown"
    [[ -z "$size" ]] && size="?"

    printf '%-8s — %s' "$size" "$model"
}
# Build a fresh, timestamped JSON export path for a disk.
# Globals:   SMART_DIR (read)
# Arguments: $1 - device path
#            $2 - test type used as file-name suffix (default: short)
# Outputs:   "<SMART_DIR>/<disk name>/<YYYY-MM-DDTHH-MM-SS>_<type>.json" on stdout
# Side effect: creates the per-disk directory if it does not exist.
_smart_json_path() {
    local disk="$1"
    local test_type="${2:-short}"
    local disk_name disk_dir timestamp

    disk_name=$(basename "$disk")
    disk_dir="${SMART_DIR}/${disk_name}"
    timestamp=$(date +%Y-%m-%dT%H-%M-%S)

    # Create disk directory if it doesn't exist
    mkdir -p "$disk_dir"

    echo "${disk_dir}/${timestamp}_${test_type}.json"
}
# Print the newest JSON export stored for a disk.
# Globals:   SMART_DIR (read)
# Arguments: $1 - device path
# Outputs:   full path of the most recent *.json file, or nothing when the
#            disk directory is missing or holds no JSON files.
# File names start with a sortable timestamp, so name order is time order.
_smart_get_latest_json() {
    local disk="$1"
    local disk_name disk_dir
    disk_name=$(basename "$disk")
    disk_dir="${SMART_DIR}/${disk_name}"

    [[ -d "$disk_dir" ]] || return 0

    # Glob instead of parsing ls output; an unmatched glob stays literal,
    # which the -e check below filters out.
    local -a json_files=("${disk_dir}"/*.json)
    [[ -e "${json_files[0]}" ]] || return 0

    printf '%s\n' "${json_files[@]}" | sort -r | head -n 1
}
# Delete old JSON exports of a disk, keeping only the newest ones.
# Globals:   SMART_DIR (read)
# Arguments: $1 - device path
#            $2 - number of newest files to keep (default: 10)
# A retention that is zero or not a plain non-negative integer disables the
# cleanup, so nothing is deleted on bad input.
_smart_cleanup_old_jsons() {
    local disk="$1"
    local retention="${2:-10}"
    local disk_name disk_dir stale
    disk_name=$(basename "$disk")
    disk_dir="${SMART_DIR}/${disk_name}"

    # Validate before doing arithmetic: -gt on arbitrary text would be
    # evaluated as an arithmetic expression. 10# avoids octal parsing of "08".
    [[ "$retention" =~ ^[0-9]+$ ]] || return 0
    (( 10#$retention > 0 )) || return 0
    [[ -d "$disk_dir" ]] || return 0

    # Glob instead of parsing ls output; an unmatched glob stays literal.
    local -a json_files=("${disk_dir}"/*.json)
    [[ -e "${json_files[0]}" ]] || return 0

    # Newest first (names begin with a sortable timestamp), skip the first
    # $retention entries, delete the rest one path per line.
    while IFS= read -r stale; do
        rm -f -- "$stale"
    done < <(printf '%s\n' "${json_files[@]}" | sort -r | tail -n "+$(( 10#$retention + 1 ))")
}
# Make sure the smartctl and nvme command-line tools are available.
# When either is missing, shows the ProxMenux header, calls
# ensure_repositories and hands each missing package to
# install_single_package (helpers sourced from outside this block).
# Does nothing, and prints nothing, when both tools are already on PATH.
_smart_ensure_packages() {
    local need_smartctl=0 need_nvme=0
    command -v smartctl >/dev/null 2>&1 || need_smartctl=1
    command -v nvme >/dev/null 2>&1 || need_nvme=1

    if (( need_smartctl == 0 && need_nvme == 0 )); then
        return 0
    fi

    show_proxmenux_logo
    msg_title "$(translate 'SMART Disk Health & Test')"
    ensure_repositories

    # Plain if-blocks (not `[[ … ]] && cmd`) so a tool that is already
    # present does not turn into a non-zero return status of this function.
    if (( need_smartctl )); then
        install_single_package "smartmontools" "smartctl" "SMART monitoring tools"
    fi
    if (( need_nvme )); then
        install_single_package "nvme-cli" "nvme" "NVMe management tools"
    fi
}
# ==========================================================
# PHASE 1 — SELECTION
# All dialogs run here. No execution, no show_proxmenux_logo.
# ==========================================================
# ── Install packages if missing ───────────────────────────
_smart_ensure_packages

# ── Step 1: Detect disks ──────────────────────────────────
# DISK_OPTIONS is the flat "tag description tag description …" list that
# `dialog --menu` takes. lsblk -e 7,11 excludes block-device major numbers
# 7 and 11 (loop and optical devices on Linux); the grep keeps only
# sd*/nvme*/vd*/hd* paths.
DISK_OPTIONS=()
while IFS= read -r disk; do
    [[ -z "$disk" ]] && continue
    [[ "$disk" =~ ^/dev/zd ]] && continue   # never offer ZFS zvols
    label=$(_smart_disk_label "$disk")
    DISK_OPTIONS+=("$disk" "$label")
done < <(lsblk -dn -e 7,11 -o PATH 2>/dev/null | grep -E '^/dev/(sd|nvme|vd|hd)')
stop_spinner

if [[ ${#DISK_OPTIONS[@]} -eq 0 ]]; then
    dialog --backtitle "$BACKTITLE" \
        --title "$(translate 'No Disks Found')" \
        --msgbox "\n$(translate 'No physical disks detected for SMART testing.')" \
        "$UI_MSG_H" "$UI_MSG_W"
    exit 1
fi
# ── Step 2: Select disk ───────────────────────────────────
# dialog draws on the terminal and writes the chosen tag to stderr, hence
# the 2>&1 >/dev/tty swap to capture it. An empty result (Cancel/Esc)
# ends the script.
SELECTED_DISK=$(dialog --backtitle "$BACKTITLE" \
    --title "$(translate 'Select Disk')" \
    --menu "\n$(translate 'Select the disk to test or inspect:')" \
    "$UI_MENU_H" "$UI_MENU_W" "$UI_MENU_LIST_H" \
    "${DISK_OPTIONS[@]}" \
    2>&1 >/dev/tty)
[[ -z "$SELECTED_DISK" ]] && exit 0
# ── Steps 3+: Action loop for the selected disk ───────────
# Each iteration: pick an action (dialog), run it (console output), export a
# JSON snapshot where applicable, then wait for Enter. Cancelling the action
# menu exits the script.
DISK_LABEL=$(_smart_disk_label "$SELECTED_DISK")
mkdir -p "$SMART_DIR"
# Optional notifier; the background monitors only call it when the file exists.
NOTIFY_SCRIPT="/usr/bin/notification_manager.py"

while true; do

    # ── Select action ───────────────────────────────────────
    ACTION=$(dialog --backtitle "$BACKTITLE" \
        --title "$(translate 'SMART Action') — $(basename "$SELECTED_DISK") (${DISK_LABEL})" \
        --menu "\n$(translate 'Select what to do with this disk:')" \
        "$UI_MENU_H" "$UI_MENU_W" "$UI_MENU_LIST_H" \
        "status"   "$(translate 'Quick health status — overall SMART result + key attributes')" \
        "report"   "$(translate 'Full report — complete SMART data (scrollable)')" \
        "short"    "$(translate 'Short test — ~2 minutes, basic surface check')" \
        "long"     "$(translate 'Long test — full scan, runs in background if closed')" \
        "progress" "$(translate 'Check test progress — show active or last test result')" \
        2>&1 >/dev/tty)
    [[ -z "$ACTION" ]] && exit 0

    # ── Long test confirmation ───────────────────────────────
    if [[ "$ACTION" == "long" ]]; then
        DISK_SIZE=$(lsblk -dn -o SIZE "$SELECTED_DISK" 2>/dev/null | xargs)
        # Compute the result path once so the path shown in the dialog is
        # the exact file the background monitor writes later.
        LONG_JSON_PATH=$(_smart_json_path "$SELECTED_DISK" "long")
        if ! dialog --backtitle "$BACKTITLE" \
            --title "$(translate 'Long Test — Background')" \
            --yesno "\n$(translate 'The long test runs directly on the disk hardware.')\n\n$(translate 'Disk:') $SELECTED_DISK ($DISK_SIZE)\n\n$(translate 'The test will continue even if you close this terminal.')\n$(translate 'Results will be saved automatically to:')\n${LONG_JSON_PATH}\n\n$(translate 'Start long test now?')" \
            16 "$UI_RESULT_W"; then
            continue
        fi
    fi

    # ========================================================
    # PHASE 2 — EXECUTION
    # show_proxmenux_logo appears here exactly once per action.
    # No dialogs from this point until "Press Enter".
    # ========================================================
    show_proxmenux_logo
    msg_title "$(translate 'SMART Disk Health & Test')"
    msg_ok "$(translate 'Disk:') ${BL}${SELECTED_DISK} — ${DISK_LABEL}${CL}"
    echo ""

    case "$ACTION" in

        # ── Quick status ────────────────────────────────────────
        status)
            if _smart_is_nvme "$SELECTED_DISK"; then
                msg_info "$(translate 'Reading NVMe SMART data...')"
                OUTPUT=$(nvme smart-log "$SELECTED_DISK" 2>/dev/null)
                stop_spinner
                if [[ -z "$OUTPUT" ]]; then
                    msg_error "$(translate 'Could not read SMART data from') $SELECTED_DISK"
                else
                    # Last field of the critical_warning line; "0" is
                    # treated as healthy.
                    HEALTH=$(echo "$OUTPUT" | grep -i "critical_warning" | awk '{print $NF}')
                    if [[ "$HEALTH" == "0" ]]; then
                        msg_ok "$(translate 'NVMe health status: PASSED')"
                    else
                        msg_warn "$(translate 'NVMe health status: WARNING (critical_warning =') $HEALTH)"
                    fi
                    echo ""
                    echo "$OUTPUT" | head -20
                fi
            else
                msg_info "$(translate 'Reading SMART data...')"
                HEALTH=$(smartctl -H "$SELECTED_DISK" 2>/dev/null | grep -i "overall-health")
                ATTRS=$(smartctl -A "$SELECTED_DISK" 2>/dev/null)
                stop_spinner
                if [[ -z "$HEALTH" ]]; then
                    msg_error "$(translate 'Could not read SMART data from') $SELECTED_DISK"
                else
                    if echo "$HEALTH" | grep -qi "PASSED"; then
                        msg_ok "$(translate 'SMART health status: PASSED')"
                    else
                        msg_warn "$HEALTH"
                    fi
                    echo ""
                    # First output line plus the wear / error / temperature attributes.
                    echo "$ATTRS" | awk 'NR==1 || /Reallocated_Sector|Current_Pending|Uncorrectable|Temperature_Celsius|Power_On_Hours|Wear_Leveling|Media_Wearout/'
                fi
            fi
            ;;

        # ── Full report (scrollable) ────────────────────────────
        report)
            msg_info "$(translate 'Reading full SMART report...')"
            TMPFILE=$(mktemp)
            if _smart_is_nvme "$SELECTED_DISK"; then
                nvme smart-log "$SELECTED_DISK" > "$TMPFILE" 2>/dev/null
                nvme id-ctrl "$SELECTED_DISK" >> "$TMPFILE" 2>/dev/null
            else
                smartctl -x "$SELECTED_DISK" > "$TMPFILE" 2>/dev/null
            fi
            stop_spinner
            if [[ -s "$TMPFILE" ]]; then
                dialog --backtitle "$BACKTITLE" \
                    --title "$(translate 'Full SMART Report') — $SELECTED_DISK" \
                    --textbox "$TMPFILE" 40 "$UI_RESULT_W"
            else
                msg_error "$(translate 'Could not read SMART data from') $SELECTED_DISK"
            fi
            rm -f "$TMPFILE"
            ;;

        # ── Short test ──────────────────────────────────────────
        short)
            if _smart_is_nvme "$SELECTED_DISK"; then
                msg_info "$(translate 'Starting NVMe short self-test...')"
                if nvme device-self-test "$SELECTED_DISK" --self-test-code=1 >/dev/null 2>&1; then
                    stop_spinner
                    msg_ok "$(translate 'Short self-test started on') $SELECTED_DISK"
                    msg_ok "$(translate 'Test typically completes in ~2 minutes.')"
                    msg_ok "$(translate 'Use "Check test progress" to see results.')"
                else
                    stop_spinner
                    msg_error "$(translate 'Failed to start self-test on') $SELECTED_DISK"
                fi
            else
                msg_info "$(translate 'Starting SMART short self-test...')"
                OUTPUT=$(smartctl -t short "$SELECTED_DISK" 2>/dev/null)
                stop_spinner
                if echo "$OUTPUT" | grep -qi "Test will complete"; then
                    msg_ok "$(translate 'Short self-test started on') $SELECTED_DISK"
                    ESTIMATE=$(echo "$OUTPUT" | grep -i "complete after" | head -1)
                    [[ -n "$ESTIMATE" ]] && msg_ok "$ESTIMATE"
                    msg_ok "$(translate 'Use "Check test progress" to see results.')"
                else
                    msg_error "$(translate 'Failed to start self-test on') $SELECTED_DISK"
                    echo "$OUTPUT" | tail -5
                fi
            fi
            ;;

        # ── Long test (background) ──────────────────────────────
        # After starting the test, a detached `bash -c` monitor polls once a
        # minute until the drive stops reporting a running test, writes the
        # JSON export and, if NOTIFY_SCRIPT exists, sends one notification.
        # Values are handed over as positional parameters ($1 disk, $2 JSON
        # path, $3 label, $4 notifier) rather than being spliced into the
        # script text, so no shell escaping of the label is needed.
        long)
            JSON_PATH="$LONG_JSON_PATH"
            _smart_cleanup_old_jsons "$SELECTED_DISK"

            if _smart_is_nvme "$SELECTED_DISK"; then
                msg_info "$(translate 'Starting NVMe long self-test...')"
                if nvme device-self-test "$SELECTED_DISK" --self-test-code=2 >/dev/null 2>&1; then
                    stop_spinner
                    msg_ok "$(translate 'Long self-test started on') $SELECTED_DISK"

                    nohup bash -c '
                        disk=$1 json_path=$2 label=$3 notifier=$4
                        while nvme device-self-test "$disk" --self-test-code=0 2>/dev/null | grep -qi "in progress"; do
                            sleep 60
                        done
                        nvme smart-log -o json "$disk" > "$json_path" 2>/dev/null

                        # Send notification when test completes
                        if [[ -f "$notifier" ]]; then
                            host=$(hostname -s)
                            test_result=$(nvme self-test-log "$disk" 2>/dev/null | head -20)
                            if echo "$test_result" | grep -qi "completed without error\|success"; then
                                python3 "$notifier" --action send-raw --severity INFO \
                                    --title "${host}: SMART Long Test Completed" \
                                    --message "NVMe disk ${disk} (${label}) - Long self-test completed successfully." 2>/dev/null || true
                            else
                                python3 "$notifier" --action send-raw --severity WARNING \
                                    --title "${host}: SMART Long Test Completed" \
                                    --message "NVMe disk ${disk} (${label}) - Long self-test completed. Check results for details." 2>/dev/null || true
                            fi
                        fi
                    ' smart-long-monitor "$SELECTED_DISK" "$JSON_PATH" "$DISK_LABEL" "$NOTIFY_SCRIPT" >/dev/null 2>&1 &
                    disown $!
                else
                    stop_spinner
                    msg_error "$(translate 'Failed to start long self-test on') $SELECTED_DISK"
                fi
            else
                msg_info "$(translate 'Starting SMART long self-test...')"
                OUTPUT=$(smartctl -t long "$SELECTED_DISK" 2>/dev/null)
                stop_spinner
                if echo "$OUTPUT" | grep -qi "Test will complete"; then
                    msg_ok "$(translate 'Long self-test started on') $SELECTED_DISK"
                    ESTIMATE=$(echo "$OUTPUT" | grep -i "complete after" | head -1)
                    [[ -n "$ESTIMATE" ]] && msg_ok "$ESTIMATE"
                    echo ""
                    msg_ok "$(translate 'Test runs on the drive hardware — safe to close this terminal.')"
                    msg_ok "$(translate 'Results will be saved to:') $JSON_PATH"

                    nohup bash -c '
                        disk=$1 json_path=$2 label=$3 notifier=$4
                        while smartctl -c "$disk" 2>/dev/null | grep -qiE "Self-test routine in progress|[1-9][0-9]?% of test remaining"; do
                            sleep 60
                        done
                        smartctl -a --json=c "$disk" > "$json_path" 2>/dev/null

                        # Send notification when test completes
                        if [[ -f "$notifier" ]]; then
                            host=$(hostname -s)
                            # Only log row 1 (the most recent test). The trailing
                            # [[:space:]] keeps rows 10-19 from matching as well.
                            test_result=$(smartctl -l selftest "$disk" 2>/dev/null | grep -E "^#[[:space:]]*1[[:space:]]")
                            if echo "$test_result" | grep -qi "Completed without error"; then
                                python3 "$notifier" --action send-raw --severity INFO \
                                    --title "${host}: SMART Long Test Completed" \
                                    --message "Disk ${disk} (${label}) - Long self-test completed successfully." 2>/dev/null || true
                            elif echo "$test_result" | grep -qi "error\|fail"; then
                                python3 "$notifier" --action send-raw --severity CRITICAL \
                                    --title "${host}: SMART Long Test FAILED" \
                                    --message "Disk ${disk} (${label}) - Long self-test completed with ERRORS. Check disk health immediately." 2>/dev/null || true
                            else
                                python3 "$notifier" --action send-raw --severity INFO \
                                    --title "${host}: SMART Long Test Completed" \
                                    --message "Disk ${disk} (${label}) - Long self-test completed. Check results for details." 2>/dev/null || true
                            fi
                        fi
                    ' smart-long-monitor "$SELECTED_DISK" "$JSON_PATH" "$DISK_LABEL" "$NOTIFY_SCRIPT" >/dev/null 2>&1 &
                    disown $!
                else
                    msg_error "$(translate 'Failed to start long self-test on') $SELECTED_DISK"
                    echo "$OUTPUT" | tail -5
                fi
            fi
            ;;

        # ── Check progress ──────────────────────────────────────
        progress)
            if _smart_is_nvme "$SELECTED_DISK"; then
                msg_info "$(translate 'Reading NVMe self-test log...')"
                OUTPUT=$(nvme self-test-log "$SELECTED_DISK" 2>/dev/null)
                stop_spinner
                if [[ -z "$OUTPUT" ]]; then
                    msg_warn "$(translate 'No self-test log available for') $SELECTED_DISK"
                else
                    echo "$OUTPUT" | head -30
                fi
            else
                msg_info "$(translate 'Reading SMART self-test log...')"
                # Active test: only "X% of test remaining" appears when a test is actually running
                ACTIVE=$(smartctl -c "$SELECTED_DISK" 2>/dev/null | grep -iE "[1-9][0-9]?% of test remaining|Self-test routine in progress")
                # Log: grab only result rows (^# N ...) and the column header (^Num)
                LOG_OUT=$(smartctl -l selftest "$SELECTED_DISK" 2>/dev/null)
                LOG_HEADER=$(echo "$LOG_OUT" | grep -E "^Num")
                LOG_ENTRIES=$(echo "$LOG_OUT" | grep -E "^# ?[0-9]")
                stop_spinner

                if [[ -n "$ACTIVE" ]]; then
                    msg_ok "$(translate 'Test in progress:')"
                    echo "$ACTIVE"
                    echo ""
                else
                    msg_ok "$(translate 'No test currently running')"
                    echo ""
                fi

                if [[ -n "$LOG_ENTRIES" ]]; then
                    msg_ok "$(translate 'Recent test results:')"
                    [[ -n "$LOG_HEADER" ]] && echo "$LOG_HEADER"
                    echo "$LOG_ENTRIES"
                else
                    msg_warn "$(translate 'No self-test history found for') $SELECTED_DISK"
                fi
            fi
            ;;

    esac

    # ── Auto-export JSON (except long — handled by background monitor)
    if [[ "$ACTION" != "long" && "$ACTION" != "report" ]]; then
        # File-name suffix: "status" for the status check, "short" for
        # everything else that reaches this point (short test and progress).
        # Plain global assignment: `local` is only valid inside a function.
        JSON_TEST_TYPE="short"
        [[ "$ACTION" == "status" ]] && JSON_TEST_TYPE="status"
        JSON_PATH=$(_smart_json_path "$SELECTED_DISK" "$JSON_TEST_TYPE")
        _smart_cleanup_old_jsons "$SELECTED_DISK"

        if _smart_is_nvme "$SELECTED_DISK"; then
            nvme smart-log -o json "$SELECTED_DISK" > "$JSON_PATH" 2>/dev/null
        else
            smartctl -a --json=c "$SELECTED_DISK" > "$JSON_PATH" 2>/dev/null
        fi
        # Drop the file again if the tool produced no output.
        [[ -s "$JSON_PATH" ]] || rm -f "$JSON_PATH"
    fi

    # ── "report" uses dialog --textbox, no Press Enter needed
    if [[ "$ACTION" != "report" ]]; then
        echo ""
        msg_success "$(translate 'Press Enter to continue...')"
        read -r
    fi

done