1
0
Fork 0
mirror of https://github.com/munin-monitoring/contrib.git synced 2025-07-22 02:51:03 +00:00

nvme: add graph for spare capacity. add support for limits

The default limits are sort of randomly chosen.  My nvme drives report
a warning level at 10% spare capacity, so I kept that here.
This commit is contained in:
Kjetil Torgrim Homme 2021-03-10 23:03:20 +01:00 committed by Lars Kruse
parent 4c1903fe29
commit c7299aeba2

View file

@ -16,9 +16,13 @@ The plugin uses nvme(1) from the nvme-cli project to read status from
the NVMe devices. This requires root access. the NVMe devices. This requires root access.
[nvme] [nvme]
user root user root
The plugin does not support alerting. When setting alert levels per device, use graph and basename of device
name, e.g., 'nvme0n1', to make environment variable:
env.nvme_usage_nvme0n1_warning 5:
env.nvme_usage_warning 8:
=head1 INTERPRETATION =head1 INTERPRETATION
@ -31,6 +35,8 @@ This reports how much of capacity is allocated in each NVMe
relation to actual use, e.g., if deleted data areas have not been relation to actual use, e.g., if deleted data areas have not been
trimmed/discarded. trimmed/discarded.
Default warning and critical: '95', '98'
=head2 nvme_bytes =head2 nvme_bytes
This reports read and write activity on each NVMe device, in bytes per This reports read and write activity on each NVMe device, in bytes per
@ -43,6 +49,8 @@ It is a good idea to compare these numbers to I/O counters from
diskstats. If they are much higher, look into whether the write diskstats. If they are much higher, look into whether the write
amplification can be due to suboptimal I/O request sizes. amplification can be due to suboptimal I/O request sizes.
This graph does not support alerting.
=head2 nvme_writecycles =head2 nvme_writecycles
This graph is intended to give an indication of how much life there This graph is intended to give an indication of how much life there
@ -54,6 +62,16 @@ experienced.
A prosumer NVMe will handle a few thousand writes to each cell before A prosumer NVMe will handle a few thousand writes to each cell before
the error rate gets out of hand. the error rate gets out of hand.
No default values for warning and critical.
=head2 nvme_spare
All NVMe devices set aside reserve space to remap media errors. This
graph shows how much of it is left, in percent, taken directly from
smart-log output.
Default warning and critical: '10:', '3:'
=head1 MAGIC MARKERS =head1 MAGIC MARKERS
#%# family=auto #%# family=auto
@ -65,7 +83,7 @@ None known.
=head1 VERSION =head1 VERSION
1.0 1.1
=head1 AUTHOR =head1 AUTHOR
@ -80,6 +98,7 @@ GPLv2
use strict; use strict;
use Munin::Plugin; use Munin::Plugin;
use IPC::Cmd qw(can_run); use IPC::Cmd qw(can_run);
use File::Basename;
# Check that multigraph is supported # Check that multigraph is supported
need_multigraph(); need_multigraph();
@ -169,6 +188,15 @@ sub smart_log {
return \%info; return \%info;
} }
# Print the Munin ".warning" / ".critical" config lines for one field.
#
#   $label        - field name the threshold lines are attached to
#   $graph        - graph name (e.g. 'nvme_usage'); passed to get_thresholds()
#                   as its field argument
#   $device       - device path; only its basename is used in the names of
#                   the per-device environment variables
#   $warn_default - warning level handed to get_thresholds() as default
#   $crit_default - critical level handed to get_thresholds() as default
#
# A level is only printed when get_thresholds() returns a defined value.
# NOTE(review): which of "<graph>_warning" and "<graph>_<dev>_warning" wins
# when both are set is decided inside get_thresholds() - confirm it matches
# the behaviour described in the POD.
sub my_print_thresholds {
    my ($label, $graph, $device, $warn_default, $crit_default) = @_;

    my $short_name = basename($device);
    my $env_prefix = "${graph}_${short_name}";

    my %level_for;
    @level_for{qw(warning critical)} = get_thresholds(
        $graph,
        "${env_prefix}_warning",
        "${env_prefix}_critical",
        $warn_default,
        $crit_default,
    );

    for my $kind (qw(warning critical)) {
        next unless defined $level_for{$kind};
        print "${label}.${kind} $level_for{$kind}\n";
    }
    return;
}
use Data::Dumper; use Data::Dumper;
my $mode = ($ARGV[0] or "print"); my $mode = ($ARGV[0] or "print");
@ -200,12 +228,14 @@ graph_category disk
graph_info How much space is used graph_info How much space is used
EOF EOF
for (@sn) { for (@sn) {
my $device = $list->{$_}->{device};
print <<"EOF"; print <<"EOF";
$_.label $list->{$_}->{device} used $_.label $device used
$_.type GAUGE $_.type GAUGE
$_.max 100 $_.max 100
$_.min 0 $_.min 0
EOF EOF
my_print_thresholds($_, 'nvme_usage', $device, '95', '98');
} }
print <<'EOF'; print <<'EOF';
multigraph nvme_bytes multigraph nvme_bytes
@ -238,11 +268,31 @@ graph_category disk
graph_info How much data has been written in lifetime divided by capacity graph_info How much data has been written in lifetime divided by capacity
EOF EOF
for (@sn) { for (@sn) {
my $device = $list->{$_}->{device};
print <<"EOF"; print <<"EOF";
$_.label $list->{$_}->{device} write cycles $_.label $device write cycles
$_.type GAUGE $_.type GAUGE
$_.min 0 $_.min 0
EOF EOF
my_print_thresholds($_, 'nvme_writecycles', $device);
}
# Graph-level configuration for the nvme_spare multigraph.
# The heredoc must interpolate: with <<'EOF' the graph_order line was sent
# to Munin as the literal text "$sn_list" instead of the field names.
# NOTE(review): assumes $sn_list is declared in the enclosing scope, as the
# graph_order line implies - confirm.
print <<"EOF";
multigraph nvme_spare
graph_title Available spare blocks
graph_order $sn_list
graph_vlabel Percent
graph_category disk
graph_info Spare capacity for replacing bad blocks
EOF
for (@sn) {
my $device = $list->{$_}->{device};
print <<"EOF";
$_.label $device spare capacity
$_.type GAUGE
$_.min 0
$_.max 100
EOF
my_print_thresholds($_, 'nvme_spare', $device, '10:', '3:');
} }
} else { } else {
for (@sn) { for (@sn) {
@ -270,4 +320,13 @@ EOF
my $cycles = $info->{smart}->{data_units_written} * 512_000 / $info->{capacity}; my $cycles = $info->{smart}->{data_units_written} * 512_000 / $info->{capacity};
print "$_.value $cycles\n"; print "$_.value $cycles\n";
} }
# Emit the current value of every field in the nvme_spare multigraph.
print "multigraph nvme_spare\n";
for my $sn (@sn) {
    # available_spare is read from the smart-log data; a '%' sign, if
    # present, is stripped so that only the bare number is printed.
    my $spare = $list->{$sn}->{smart}->{available_spare};
    if (defined $spare) {
        $spare =~ s/%//;
    }
    else {
        # No reading for this device: report Munin's "unknown" marker
        # rather than an empty value.
        $spare = 'U';
    }
    print "$sn.value $spare\n";
}
} }