diff --git a/plugins/other/nvidia_smi_ b/plugins/other/nvidia_smi_
new file mode 100755
index 00000000..f3290a2c
--- /dev/null
+++ b/plugins/other/nvidia_smi_
@@ -0,0 +1,137 @@
+#!/usr/bin/perl -w
+# -*- perl -*-
+#
+# Script to monitor NVIDIA Graphics Card.
+#
+# Parameters understood:
+#
+#   config   (required)
+#   autoconf (optional - used by munin-config)
+#   suggest  (optional - prints one name per "GPU" line of 'nvidia-smi -a')
+#
+# Environment: nvidia_smi = path to nvidia-smi (default /usr/bin/nvidia-smi)
+#
+# Magic markers (optional - used by munin-config and installation
+# scripts):
+#%# family=auto
+#%# capabilities=autoconf suggest
+
+use strict;
+use XML::Simple;
+
+# Path of the nvidia-smi binary; the nvidia_smi environment variable wins.
+my $nvidia_smi = $ENV{nvidia_smi} || "/usr/bin/nvidia-smi";
+
+## Munin autoconf method.
+# Prints "yes" or "no (reason)" and exits 0 in either case.
+if (exists $ARGV[0] and $ARGV[0] eq "autoconf") {
+    my $reason;
+    if (!-e $nvidia_smi) {
+        $reason = "file $nvidia_smi does not exists";
+    }
+    elsif (!-x $nvidia_smi) {
+        $reason = "file $nvidia_smi exists, but not executable";
+    }
+    elsif (!grep { /GPU/ } `$nvidia_smi -a 2>/dev/null`) {
+        # No line of the 'nvidia-smi -a' output (stderr discarded) has "GPU".
+        $reason = "No GPUs found. Check '$nvidia_smi -a' output";
+    }
+
+    # print, not printf: a '%' in the path must not be read as a format.
+    print(defined $reason ? "no ($reason)\n" : "yes\n");
+    exit 0;
+}
+
+## Munin suggest method.
+if (defined $ARGV[0] and $ARGV[0] eq 'suggest') {
+    # Pure-Perl filter: report lines such as "GPU 0:" are printed as "gpu0".
+    for my $line (grep { /^GPU/ } `$nvidia_smi -a`) {
+        $line =~ tr/ ://d;
+        print lc $line;
+    }
+    exit 0;
+}
+
+# GPU id = numeric suffix of the plugin name. List-context capture is undef on
+# a failed match; digits only keeps shell metacharacters out of the command.
+my ($gpu_id) = $0 =~ /nvidia_smi_gpu(\d+)$/;
+exit 2 unless defined $gpu_id;
+
+# Get XML with sensor values for GPU with particular ID
+my $data = `$nvidia_smi -g $gpu_id -x`;
+my $why  = $? == -1 ? $! : "exit status " . ($? >> 8);    # $! only if not started
+die "Could not run $nvidia_smi: $why\n" unless defined $data and $data =~ /\S/;
+
+# Parse XML; non-hash sub-trees become {} so lookups cannot die under strict.
+my $ref  = XMLin($data);
+my $card = ref $ref->{gpu} eq 'HASH' ? $ref->{gpu} : {};
+my $load = ref $card->{utilization} eq 'HASH' ? $card->{utilization} : {};
+# Raw sensor text per field, paired with the pattern that strips its unit.
+my %raw = (
+    temp => [ $card->{temp},        qr/^(.+) C$/ ],
+    fan  => [ $card->{fan_speed},   qr/^(.+)\%$/ ],
+    util => [ $load->{gpu_util},    qr/^(.+)\%$/ ],
+    mem  => [ $load->{memory_util}, qr/^(.+)\%$/ ],
+);
+
+# Values cleaned of percent and Celsius signs. A field is stored only when
+# its pattern matches: after a failed match $1 still holds an older capture.
+my %gpu = ();
+for my $field (keys %raw) {
+    my ($text, $pattern) = @{ $raw{$field} };
+    $gpu{$field} = $1 if defined $text and $text =~ $pattern;
+}
+$gpu{model}  = $card->{prod_name}     if exists $card->{prod_name};
+$gpu{driver} = $ref->{driver_version} if exists $ref->{driver_version};
+
+my $card_model     = $gpu{model}  || "";
+my $driver_version = $gpu{driver} || "";
+
+## Munin config method.
+if (exists $ARGV[0] and $ARGV[0] eq "config") {
+    # Graph-wide attributes. graph_args is printed once with every option:
+    # it is a single attribute, so two separate lines cannot be combined.
+    print "graph_title $card_model sensors\n";
+    print "graph_args --base 1000 --upper-limit 100 -l 0\n";
+    print "graph_category sensors\n";
+    print "graph_vlabel % or C\n";
+    print "graph_info This graph shows information about your $card_model graphics card running driver version $driver_version.\n";
+
+    # One row per data source: %gpu key, label, info, warning, critical.
+    my @sources = (
+        [
+            'temp', 'GPU Temperature (C)',
+            'GPU temperature sensor', ':80', ':100',
+        ],
+        [
+            'mem', 'Memory consumption (%)',
+            'How much of on-board memory is used', ':85', ':95',
+        ],
+        [
+            'util', 'GPU Utilization (%)',
+            'How much computational resourses are used',
+        ],
+        [
+            'fan', 'Fan Speed (%)',
+            'Fan RPM in precent of maximum', ':80', ':95',
+        ],
+    );
+
+    for my $source (@sources) {
+        my ($key, $label, $info, $warning, $critical) = @$source;
+        next unless defined $gpu{$key};    # no value was stored for this sensor
+        print "gpu_${key}.label $label\n";
+        print "gpu_${key}.info $info\n";
+        print "gpu_${key}.draw LINE2\n";
+        print "gpu_${key}.warning $warning\n"   if defined $warning;
+        print "gpu_${key}.critical $critical\n" if defined $critical;
+    }
+
+    exit 0;
+}
+
+# Default action: one "gpu_NAME.value" line per sensor that has a value.
+for my $key (qw(temp mem util fan)) {
+    print "gpu_${key}.value $gpu{$key}\n" if defined $gpu{$key};
+}
+