mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-24 18:07:20 +00:00
nvme - Munin plugin to monitor the use of NVMe devices
This is a multigraph plugin which makes three graphs, nvme_usage, nvme_bytes and nvme_writecycles
This commit is contained in:
parent
9b3aa2671b
commit
cca33b3984
1 changed files with 244 additions and 0 deletions
244
plugins/disk/nvme
Executable file
244
plugins/disk/nvme
Executable file
|
@ -0,0 +1,244 @@
|
|||
#! /usr/bin/perl -w
|
||||
# -*- perl -*-
|
||||
|
||||
=head1 NAME
|
||||
|
||||
nvme - Munin plugin to monitor the use of NVMe devices
|
||||
|
||||
=head1 CONFIGURATION
|
||||
|
||||
The plugin uses nvme(1) from the nvme-cli project to read status from
|
||||
the NVMe devices. This requires root access.
|
||||
|
||||
[nvme]
|
||||
user root
|
||||
|
||||
The plugin does not support alerting.
|
||||
|
||||
=head1 INTERPRETATION
|
||||
|
||||
This is a multigraph plugin which makes three graphs
|
||||
|
||||
=head2 nvme_usage
|
||||
|
||||
This reports how much of the capacity is allocated in each NVMe
|
||||
"namespace". The report is in percent. This number may not have much
|
||||
relation to actual use, e.g., if deleted data areas have not been
|
||||
trimmed/discarded.
|
||||
|
||||
=head2 nvme_bytes
|
||||
|
||||
This reports read and write activity on each NVMe device, in bytes per
|
||||
second. Ideally there should be much more read than write. If they
|
||||
are symmetrical, you are using your NVMe as a very expensive FIFO, and
|
||||
if you write more than you read, you should probably look for archival
|
||||
storage instead.
|
||||
|
||||
It is a good idea to compare these numbers to I/O counters from
|
||||
diskstats. If they are much higher, look into if the write
|
||||
amplification can be due to suboptimal I/O request sizes.
|
||||
|
||||
=head2 nvme_writecycles
|
||||
|
||||
This graph is intended to give an indication of how much life there
|
||||
is left in your NVMe. It calculates the number of bytes written
|
||||
during each device's lifetime against the capacity of the device,
|
||||
thereby getting an average number of write cycles each cell has
|
||||
experienced.
|
||||
|
||||
A prosumer NVMe will handle a few thousand writes to each cell before
|
||||
the error rate gets out of hand.
|
||||
|
||||
=head1 MAGIC MARKERS
|
||||
|
||||
#%# family=auto
|
||||
#%# capabilities=autoconf
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
None known.
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
1.0
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Kjetil Torgrim Homme <kjetil.homme@redpill-linpro.com>
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
GPLv2
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use Munin::Plugin;
|
||||
|
||||
# Check that multigraph is supported
|
||||
need_multigraph();
|
||||
|
||||
# Run the nvme(1) command line tool and collect what it prints.
#
# Arguments: the sub-command and its arguments, e.g. ('smart-log', $dev).
# They are passed as a list to open(), so no shell is involved.
#
# Returns the output lines (newlines included).  When the command cannot
# be started the result is an empty list.
sub run_nvme {
    my (@cmd) = @_;
    my @output;

    # Ask for untranslated output so the parsers in this file see the
    # text they expect.  Note that this modifies the process environment.
    $ENV{'LC_ALL'} = 'C';

    my $started = open(my $pipe, '-|', 'nvme', @cmd);
    if ($started) {
        @output = <$pipe>;
        close($pipe);
    }
    # On failure Perl has already warned about the failed exec, so the
    # error is deliberately ignored and nothing is returned.

    return @output;
}
|
||||
|
||||
# Convert a human readable size such as "695.50  GB", "2.00  TB" or
# "512   B" into a number of bytes.
#
# Units are decimal (powers of 1000).  A bare "B" means plain bytes.
#
# Argument: the string to parse; the first "<number> <unit>" pair found
#           in it is used.
# Returns:  the size in bytes, truncated to an integer, or 0 when the
#           string is undefined or contains no recognisable size.
sub human_to_bytes {
    my ($str) = @_;
    my %units = (
        B  => 1,
        kB => 1000,
        KB => 1000,
        MB => 1000_000,
        GB => 1000_000_000,
        TB => 1000_000_000_000,
        PB => 1000_000_000_000_000, # I wish I had need for this
    );

    return 0 unless defined $str;

    # Only use the captures when the match succeeded; otherwise $1 and $2
    # would still hold whatever an earlier match in the caller left there.
    if ($str =~ /(\d+(?:\.\d+)?)\s*([kKMGTP]?B)/) {
        return int($1 * $units{$2});
    }
    return 0;
}
|
||||
|
||||
# Build an inventory of the NVMe namespaces reported by "nvme list".
#
# Returns a reference to a hash keyed by serial number.  Each value is a
# hash reference with the keys device, sn, model, namespace, usage and
# capacity; usage and capacity are converted with human_to_bytes().
# Lines which do not look like a device row (headings, rulers) are
# skipped.
sub nvme_list {
    # Example of the output being parsed:
    #
    # Node          SN               Model                    Namespace Usage                 Format       FW Rev
    # ------------- ---------------- ------------------------ --------- --------------------- ------------ --------
    # /dev/nvme1n1  S464NB0K601188N  Samsung SSD 970 EVO 2TB  1         695.50 GB / 2.00 TB   512 B + 0 B  1B2QEXE7
    my %devices;

    for my $row (run_nvme('list')) {
        my ($node, $serial, $model, $ns, $used, $total) =
            $row =~ m:^(/\S+)\s+(\S+)\s+(\S.*\S)\s{3,}(\d+)\s+(\S+\s+.B)\s+/\s+(\S+\s+.B):
            or next;

        my %entry = (
            device    => $node,
            sn        => $serial,
            model     => $model,
            namespace => $ns,
            usage     => human_to_bytes($used),
            capacity  => human_to_bytes($total),
        );
        $devices{$serial} = \%entry;
    }

    return \%devices;
}
|
||||
|
||||
# Read the SMART log of one device with "nvme smart-log".
#
# Argument: the device node to query.
# Returns:  a reference to a hash with one entry per "name : value" line.
#           The name is lower-cased and each whitespace character in it
#           is replaced by "_".  A value made up solely of comma-grouped
#           digits (e.g. "1,234,567") has its commas removed; any other
#           value is stored unchanged.
sub smart_log {
    my ($dev) = @_;
    my %info;

    LINE:
    for my $line (run_nvme('smart-log', $dev)) {
        # The heading line carries no data.
        next LINE if $line =~ /^Smart Log/;

        my ($name, $value) = $line =~ /(.*?)\s+:\s+(.*)/
            or next LINE;

        $name =~ s/\s/_/g;
        $value =~ tr/,//d if $value =~ /^\d+(,\d\d\d)+$/;

        $info{lc $name} = $value;
    }

    return \%info;
}
|
||||
|
||||
use Data::Dumper;
|
||||
|
||||
# The first command line argument selects what to do; without one the
# plugin prints the current values.
my $mode = $ARGV[0] || "print";

# The device inventory is needed in every mode.
my $list = nvme_list();

# autoconf: report whether there is anything to monitor, then stop.
if ($mode eq 'autoconf') {
    my $answer = scalar(keys %{$list})
        ? "yes\n"
        : "no (no devices to monitor)\n";
    print $answer;
    exit 0;
}

# Serial numbers in a stable order; they double as Munin field names.
my @sn = sort keys %{$list};
|
||||
|
||||
# Main output.  In "config" mode the definitions of the three graphs
# (nvme_usage, nvme_bytes, nvme_writecycles) are printed; in any other
# mode the current values for the same three graphs are printed.
if ($mode eq 'config') {
    my $sn_list = join(' ', @sn);

    # nvme_bytes has two fields per device, <sn>_r and <sn>_w, so its
    # graph_order has to name those rather than the bare serial numbers.
    my $rw_list = join(' ', map { ("${_}_r", "${_}_w") } @sn);

    # The graph headers use interpolating heredocs so that the field
    # lists really end up in graph_order.
    print <<"EOF";
multigraph nvme_usage
graph_title NVME Namespace Usage
graph_order $sn_list
graph_vlabel Percent used
graph_scale no
graph_category disk
graph_info How much space is used
EOF
    for my $id (@sn) {
        print <<"EOF";
$id.label $list->{$id}->{device} used
$id.type GAUGE
$id.max 100
$id.min 0
EOF
    }

    # \${graph_period} is escaped: it must reach Munin verbatim, which
    # substitutes the period itself.
    print <<"EOF";
multigraph nvme_bytes
graph_title NVME Bytes Read / Written
graph_order $rw_list
graph_vlabel bytes read (-) / written (+) per \${graph_period}
graph_category disk
graph_info How much data is read and written
graph_period second
EOF
    for my $id (@sn) {
        print <<"EOF";
${id}_r.label $list->{$id}->{device}
${id}_r.type DERIVE
${id}_r.min 0
${id}_r.graph no
${id}_w.label $list->{$id}->{device}
${id}_w.type DERIVE
${id}_w.min 0
${id}_w.negative ${id}_r
EOF
    }

    print <<"EOF";
multigraph nvme_writecycles
graph_title NVME Write Cycles
graph_order $sn_list
graph_vlabel Cycles
graph_args --logarithmic
graph_category disk
graph_info How much data has been written in lifetime divided by capacity
EOF
    for my $id (@sn) {
        print <<"EOF";
$id.label $list->{$id}->{device} write cycles
$id.type GAUGE
$id.min 0
EOF
    }
} else {
    for my $id (@sn) {
        $list->{$id}->{smart} = smart_log($list->{$id}->{device});
    }

    # The SMART log counts data in units of 1000 blocks of 512 bytes.
    my $bytes_per_unit = 512_000;

    # Turn a data_units_* value into bytes.  Only the leading run of
    # digits (with optional thousands separators) is used, in case the
    # tool appends something after the counter.  A missing or
    # non-numeric value yields "U", Munin's marker for "unknown".
    my $units_to_bytes = sub {
        my ($raw) = @_;
        return 'U' unless defined $raw && $raw =~ /^(\d[\d,]*)/;
        (my $count = $1) =~ tr/,//d;
        return $count * $bytes_per_unit;
    };

    print "multigraph nvme_usage\n";
    for my $id (@sn) {
        my $info = $list->{$id};
        # Guard against a capacity that is zero or could not be parsed.
        my $used = $info->{capacity}
            ? 100 * $info->{usage} / $info->{capacity}
            : 'U';
        print "$id.value $used\n";
    }

    print "multigraph nvme_bytes\n";
    for my $id (@sn) {
        my $smart  = $list->{$id}->{smart};
        my $rbytes = $units_to_bytes->($smart->{data_units_read});
        my $wbytes = $units_to_bytes->($smart->{data_units_written});
        print "${id}_r.value $rbytes\n";
        print "${id}_w.value $wbytes\n";
    }

    print "multigraph nvme_writecycles\n";
    for my $id (@sn) {
        my $info    = $list->{$id};
        my $written = $units_to_bytes->($info->{smart}->{data_units_written});

        # Average write cycles per cell: lifetime bytes written divided
        # by the capacity of the device.
        my $cycles = ($written eq 'U' || !$info->{capacity})
            ? 'U'
            : $written / $info->{capacity};
        print "$id.value $cycles\n";
    }
}
|
Loading…
Add table
Add a link
Reference in a new issue