mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-22 02:51:03 +00:00
nvme: add graph for spare capacity. add support for limits
The default limits are sort of randomly chosen. My nvme drives report a warning level at 10% spare capacity, so I kept that here.
This commit is contained in:
parent
4c1903fe29
commit
c7299aeba2
1 changed files with 64 additions and 5 deletions
|
@ -18,7 +18,11 @@ the NVMe devices. This requires root access.
|
||||||
[nvme]
|
[nvme]
|
||||||
user root
|
user root
|
||||||
|
|
||||||
The plugin does not support alerting.
|
When setting alert levels per device, use the graph name and the basename of the device
|
||||||
|
name, e.g., 'nvme0n1', to form the environment variable name:
|
||||||
|
|
||||||
|
env.nvme_usage_nvme0n1_warning 5:
|
||||||
|
env.nvme_usage_warning 8:
|
||||||
|
|
||||||
=head1 INTERPRETATION
|
=head1 INTERPRETATION
|
||||||
|
|
||||||
|
@ -31,6 +35,8 @@ This reports how much of capacity is allocated in each NVMe
|
||||||
relation to actual use, e.g., if deleted data areas have not been
|
relation to actual use, e.g., if deleted data areas have not been
|
||||||
trimmed/discarded.
|
trimmed/discarded.
|
||||||
|
|
||||||
|
Default warning and critical: '95', '98'
|
||||||
|
|
||||||
=head2 nvme_bytes
|
=head2 nvme_bytes
|
||||||
|
|
||||||
This reports read and write activity on each NVMe device, in bytes per
|
This reports read and write activity on each NVMe device, in bytes per
|
||||||
|
@ -43,6 +49,8 @@ It is a good idea to compare these numbers to I/O counters from
|
||||||
diskstats. If they are much higher, look into whether the write
|
diskstats. If they are much higher, look into whether the write
|
||||||
amplification can be due to suboptimal I/O request sizes.
|
amplification can be due to suboptimal I/O request sizes.
|
||||||
|
|
||||||
|
This graph does not support alerting.
|
||||||
|
|
||||||
=head2 nvme_writecycles
|
=head2 nvme_writecycles
|
||||||
|
|
||||||
This graph is intended to give an indication of how much life there
|
This graph is intended to give an indication of how much life there
|
||||||
|
@ -54,6 +62,16 @@ experienced.
|
||||||
A prosumer NVMe will handle a few thousand writes to each cell before
|
A prosumer NVMe will handle a few thousand writes to each cell before
|
||||||
the error rate gets out of hand.
|
the error rate gets out of hand.
|
||||||
|
|
||||||
|
No default values for warning and critical.
|
||||||
|
|
||||||
|
=head2 nvme_spare
|
||||||
|
|
||||||
|
Every NVMe device has reserve space set aside to remap media errors. This
|
||||||
|
graphs how much is left in percent, taken directly from smart-log
|
||||||
|
output.
|
||||||
|
|
||||||
|
Default warning and critical: '10:', '3:'
|
||||||
|
|
||||||
=head1 MAGIC MARKERS
|
=head1 MAGIC MARKERS
|
||||||
|
|
||||||
#%# family=auto
|
#%# family=auto
|
||||||
|
@ -65,7 +83,7 @@ None known.
|
||||||
|
|
||||||
=head1 VERSION
|
=head1 VERSION
|
||||||
|
|
||||||
1.0
|
1.1
|
||||||
|
|
||||||
=head1 AUTHOR
|
=head1 AUTHOR
|
||||||
|
|
||||||
|
@ -80,6 +98,7 @@ GPLv2
|
||||||
use strict;
|
use strict;
|
||||||
use Munin::Plugin;
|
use Munin::Plugin;
|
||||||
use IPC::Cmd qw(can_run);
|
use IPC::Cmd qw(can_run);
|
||||||
|
use File::Basename;
|
||||||
|
|
||||||
# Check that multigraph is supported
|
# Check that multigraph is supported
|
||||||
need_multigraph();
|
need_multigraph();
|
||||||
|
@ -169,6 +188,15 @@ sub smart_log {
|
||||||
return \%info;
|
return \%info;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sub my_print_thresholds {
|
||||||
|
my ($label, $graph, $device, $warn_default, $crit_default) = @_;
|
||||||
|
my $dev = basename($device);
|
||||||
|
my ($warn, $crit) = get_thresholds($graph, "${graph}_${dev}_warning", "${graph}_${dev}_critical",
|
||||||
|
$warn_default, $crit_default);
|
||||||
|
print "${label}.warning $warn\n" if defined $warn;
|
||||||
|
print "${label}.critical $crit\n" if defined $crit;
|
||||||
|
}
|
||||||
|
|
||||||
use Data::Dumper;
|
use Data::Dumper;
|
||||||
|
|
||||||
my $mode = ($ARGV[0] or "print");
|
my $mode = ($ARGV[0] or "print");
|
||||||
|
@ -200,12 +228,14 @@ graph_category disk
|
||||||
graph_info How much space is used
|
graph_info How much space is used
|
||||||
EOF
|
EOF
|
||||||
for (@sn) {
|
for (@sn) {
|
||||||
|
my $device = $list->{$_}->{device};
|
||||||
print <<"EOF";
|
print <<"EOF";
|
||||||
$_.label $list->{$_}->{device} used
|
$_.label $device used
|
||||||
$_.type GAUGE
|
$_.type GAUGE
|
||||||
$_.max 100
|
$_.max 100
|
||||||
$_.min 0
|
$_.min 0
|
||||||
EOF
|
EOF
|
||||||
|
my_print_thresholds($_, 'nvme_usage', $device, '95', '98');
|
||||||
}
|
}
|
||||||
print <<'EOF';
|
print <<'EOF';
|
||||||
multigraph nvme_bytes
|
multigraph nvme_bytes
|
||||||
|
@ -238,11 +268,31 @@ graph_category disk
|
||||||
graph_info How much data has been written in lifetime divided by capacity
|
graph_info How much data has been written in lifetime divided by capacity
|
||||||
EOF
|
EOF
|
||||||
for (@sn) {
|
for (@sn) {
|
||||||
|
my $device = $list->{$_}->{device};
|
||||||
print <<"EOF";
|
print <<"EOF";
|
||||||
$_.label $list->{$_}->{device} write cycles
|
$_.label $device write cycles
|
||||||
$_.type GAUGE
|
$_.type GAUGE
|
||||||
$_.min 0
|
$_.min 0
|
||||||
EOF
|
EOF
|
||||||
|
my_print_thresholds($_, 'nvme_writecycles', $device);
|
||||||
|
}
|
||||||
|
print <<'EOF';
|
||||||
|
multigraph nvme_spare
|
||||||
|
graph_title Available spare blocks
|
||||||
|
graph_order $sn_list
|
||||||
|
graph_vlabel Percent
|
||||||
|
graph_category disk
|
||||||
|
graph_info Spare capacity for replacing bad blocks
|
||||||
|
EOF
|
||||||
|
for (@sn) {
|
||||||
|
my $device = $list->{$_}->{device};
|
||||||
|
print <<"EOF";
|
||||||
|
$_.label $device spare capacity
|
||||||
|
$_.type GAUGE
|
||||||
|
$_.min 0
|
||||||
|
$_.max 100
|
||||||
|
EOF
|
||||||
|
my_print_thresholds($_, 'nvme_spare', $device, '10:', '3:');
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (@sn) {
|
for (@sn) {
|
||||||
|
@ -270,4 +320,13 @@ EOF
|
||||||
my $cycles = $info->{smart}->{data_units_written} * 512_000 / $info->{capacity};
|
my $cycles = $info->{smart}->{data_units_written} * 512_000 / $info->{capacity};
|
||||||
print "$_.value $cycles\n";
|
print "$_.value $cycles\n";
|
||||||
}
|
}
|
||||||
|
print "multigraph nvme_spare\n";
|
||||||
|
for (@sn) {
|
||||||
|
my $info = $list->{$_};
|
||||||
|
|
||||||
|
# available_spare is reported as a percentage; strip the '%' sign.
|
||||||
|
my $spare = $info->{smart}->{available_spare};
|
||||||
|
$spare =~ s/%//;
|
||||||
|
print "$_.value $spare\n";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue