warning
1.3.1. SMART device temperature warning
Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
# Warning-level temperature alert for devices that do NOT report a drive_trip
# threshold: the `unless on (instance, device)` clause drops every
# instance/device pair that has a temperature_type="drive_trip" series, and the
# remaining 5-minute average of the current temperature is compared against 60
# (the description text states the unit as °C).
- alert: SMARTDeviceTemperatureWarning
  expr: (avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: SMART device temperature warning (instance {{ $labels.instance }})
    description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.2. SMART device temperature critical
Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
# Critical-level temperature alert for devices that do NOT report a drive_trip
# threshold (pairs with a temperature_type="drive_trip" series are excluded by
# the `unless on (instance, device)` clause). Uses the 5-minute maximum of the
# current temperature, so a single sample above 70 within the window matches.
- alert: SMARTDeviceTemperatureCritical
  expr: (max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: SMART device temperature critical (instance {{ $labels.instance }})
    description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.3. SMART device temperature over trip value
Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}
# Fires when the 10-minute maximum of the current temperature is at or above
# the same device's drive_trip value. The two series are matched on the
# (device, instance) label pair only, so devices without a drive_trip series
# produce no result here.
- alert: SMARTDeviceTemperatureOverTripValue
  expr: max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: SMART device temperature over trip value (instance {{ $labels.instance }})
    description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.4. SMART device temperature nearing trip value
Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}
# Fires when the 10-minute maximum of the current temperature is at or above
# 80% of the same device's drive_trip value (matched on device and instance).
# The comparison is `>=` with no upper bound, so a device that has already
# reached its trip value still matches this rule as well.
- alert: SMARTDeviceTemperatureNearingTripValue
  expr: max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
    description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.5. SMART status
Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}
# Fires for every series whose smartctl_device_smart_status value is anything
# other than 1.
# NOTE(review): presumably 1 means the SMART health check passed; confirm
# against the exporter's metric documentation.
- alert: SMARTStatus
  expr: smartctl_device_smart_status != 1
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: SMART status (instance {{ $labels.instance }})
    description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.6. SMART critical warning
Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}
# Fires for every series whose smartctl_device_critical_warning value is
# greater than zero.
# NOTE(review): the metric is presumably the controller's critical-warning
# field, where any non-zero value flags a problem; confirm against the exporter.
- alert: SMARTCriticalWarning
  expr: smartctl_device_critical_warning > 0
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: SMART critical warning (instance {{ $labels.instance }})
    description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.7. SMART media errors
Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}
# Fires for every series whose smartctl_device_media_errors value is greater
# than zero.
# NOTE(review): if this metric is a cumulative count, the alert never resolves
# once a device has logged a single error; confirm that is the intended
# behavior.
- alert: SMARTMediaErrors
  expr: smartctl_device_media_errors > 0
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: SMART media errors (instance {{ $labels.instance }})
    description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
1.3.8. SMART Wearout Indicator
Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}
# Fires when a device's available-spare value drops below the threshold the
# device itself reports. No `on(...)` modifier is used, so the two metrics are
# matched one-to-one on their full label sets; series whose labels differ
# between the two metrics produce no result.
- alert: SMARTWearoutIndicator
  expr: smartctl_device_available_spare < smartctl_device_available_spare_threshold
  # No pending period: the alert fires on the first evaluation that matches.
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: SMART Wearout Indicator (instance {{ $labels.instance }})
    description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"