From c7299aeba27dd9468293735e6b5ea73a4942d278 Mon Sep 17 00:00:00 2001 From: Kjetil Torgrim Homme Date: Wed, 10 Mar 2021 23:03:20 +0100 Subject: [PATCH] nvme: add graph for spare capacity. add support for limits The default limits are sort of randomly chosen. My nvme drives report a warning level at 10% spare capacity, so I kept that here. --- plugins/disk/nvme | 69 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/plugins/disk/nvme b/plugins/disk/nvme index cca0e453..18e39e89 100755 --- a/plugins/disk/nvme +++ b/plugins/disk/nvme @@ -16,9 +16,13 @@ The plugin uses nvme(1) from the nvme-cli project to read status from the NVMe devices. This requires root access. [nvme] - user root + user root -The plugin does not support alerting. +When setting alert levels per device, combine the graph name and the +device basename, e.g., 'nvme0n1', to form the environment variable name: + + env.nvme_usage_nvme0n1_warning 5: + env.nvme_usage_warning 8: =head1 INTERPRETATION @@ -31,6 +35,8 @@ This reports how much of capacity is allocated in each NVMe relation to actual use, e.g., if deleted data areas have not been trimmed/discarded. +Default warning and critical: '95', '98' + =head2 nvme_bytes This reports read and write activity on each NVMe device, in bytes per @@ -43,6 +49,8 @@ It is a good idea to compare these numbers to I/O counters from diskstats. If they are much higher, look into whether the write amplification can be due to suboptimal I/O request sizes. +This graph does not support alerting. + =head2 nvme_writecycles This graphs is intended to give an indication of how much life there @@ -54,6 +62,16 @@ experienced. A prosumer NVMe will handle a few thousand writes to each cell before the error rate gets out of hand. +No default values for warning and critical. + +=head2 nvme_spare + +All NVMe devices set aside reserve space to remap media errors. This +graphs how much is left in percent, taken directly from smart-log +output. 
+ +Default warning and critical: '10:', '3:' + =head1 MAGIC MARKERS #%# family=auto @@ -65,7 +83,7 @@ None known. =head1 VERSION - 1.0 + 1.1 =head1 AUTHOR @@ -80,6 +98,7 @@ GPLv2 use strict; use Munin::Plugin; use IPC::Cmd qw(can_run); +use File::Basename; # Check that multigraph is supported need_multigraph(); @@ -169,6 +188,15 @@ sub smart_log { return \%info; } +sub my_print_thresholds { + my ($label, $graph, $device, $warn_default, $crit_default) = @_; + my $dev = basename($device); + my ($warn, $crit) = get_thresholds($graph, "${graph}_${dev}_warning", "${graph}_${dev}_critical", + $warn_default, $crit_default); + print "${label}.warning $warn\n" if defined $warn; + print "${label}.critical $crit\n" if defined $crit; +} + use Data::Dumper; my $mode = ($ARGV[0] or "print"); @@ -200,12 +228,14 @@ graph_category disk graph_info How much space is used EOF for (@sn) { + my $device = $list->{$_}->{device}; print <<"EOF"; -$_.label $list->{$_}->{device} used +$_.label $device used $_.type GAUGE $_.max 100 $_.min 0 EOF + my_print_thresholds($_, 'nvme_usage', $device, '95', '98'); } print <<'EOF'; multigraph nvme_bytes @@ -238,11 +268,31 @@ graph_category disk graph_info How much data has been written in lifetime divided by capacity EOF for (@sn) { + my $device = $list->{$_}->{device}; print <<"EOF"; -$_.label $list->{$_}->{device} write cycles +$_.label $device write cycles $_.type GAUGE $_.min 0 EOF + my_print_thresholds($_, 'nvme_writecycles', $device); + } + print <<'EOF'; +multigraph nvme_spare +graph_title Available spare blocks +graph_order $sn_list +graph_vlabel Percent +graph_category disk +graph_info Spare capacity for replacing bad blocks +EOF + for (@sn) { + my $device = $list->{$_}->{device}; + print <<"EOF"; +$_.label $device spare capacity +$_.type GAUGE +$_.min 0 +$_.max 100 +EOF + my_print_thresholds($_, 'nvme_spare', $device, '10:', '3:'); } } else { for (@sn) { @@ -270,4 +320,13 @@ EOF my $cycles = $info->{smart}->{data_units_written} * 512_000 / 
$info->{capacity}; print "$_.value $cycles\n"; } + print "multigraph nvme_spare\n"; + for (@sn) { + my $info = $list->{$_}; + + # Strip the percent sign so only the number is reported. + my $spare = $info->{smart}->{available_spare}; + $spare =~ s/%//; + print "$_.value $spare\n"; + } }