mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-24 09:57:09 +00:00
xorg-x11-drv-nvidia-340.29-1.el6.x86_64 (CUDA 6.5?) seems to have changed the Temperature lines from "Gpu" to "GPU" prefixes. Case-insensitive grep preserves backwards compatibility.
226 lines
6.1 KiB
Bash
Executable file
226 lines
6.1 KiB
Bash
Executable file
#!/bin/sh
# -*- sh -*-

# The POD documentation below is fed to the ":" no-op through a here-document,
# so the shell skips it while perldoc-style tools can still read it.
: << =cut

=head1 NAME

nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
usually bundled with NVIDIA GPU driver, to obtain information.

=head1 CONFIGURATION

This is a wildcard plugin. The suffix of the symlink name selects the
value to monitor (temp, mem or fan).

This plugin uses the following configuration variables:

 [nvidia_gpu_*]
  env.smiexec - Location of nvidia-smi executable.
  env.warning - Warning temperature
  env.critical - Critical temperature

=head2 DEFAULT CONFIGURATION

The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
assume warning and critical temperatures of 75 and 95 degrees celsius, respectively.

=head2 EXAMPLE WILDCARD USAGE

C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>

...will monitor the temperature of available GPUs.

=head1 TODO

=over 4

=item *

Add support for specific professional GPU features such as number of compute
processes, clocks, power draw, utilization, and so on.

=item *

Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).

=back

=head1 AUTHOR

Nuno Fachada
faken@fakenmc.com

=head1 LICENSE

GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf suggest

=cut

# Determine which quantity to monitor from the name this plugin was invoked
# under: drop the directory part, then the "nvidia_gpu_" prefix
# (e.g. /etc/munin/plugins/nvidia_gpu_temp -> temp).
# Parameter expansion replaces the unquoted `basename $0 | sed`, which
# produced a wrong name when the plugin path contained whitespace.
name=${0##*/}
name=${name#nvidia_gpu_}

# Location of the nvidia-smi executable: env.smiexec when set and non-empty,
# otherwise the default path.
nvSmiExec=${smiexec:-/usr/bin/nvidia-smi}

# Munin "autoconf" request: report whether this plugin can work on this host.
# Prints "yes" only if the nvidia-smi path exists and is executable, otherwise
# "no (reason)". The exit status is 0 for both answers.
if [ "$1" = "autoconf" ]; then
  # Quoted so that a path containing whitespace is tested as one operand.
  if [ -x "$nvSmiExec" ]; then
    echo yes
  else
    echo "no (nvidia-smi executable not found)"
  fi
  exit 0
fi

# Munin "suggest" request: list the wildcard suffixes this plugin can be
# linked as, one per line, then exit successfully.
if [ "$1" = "suggest" ]; then
  printf '%s\n' temp mem fan
  exit 0
fi

# List the GPUs ("nvidia-smi -L" output, expected to be one line per device).
# If the command cannot be run or reports failure, its output is discarded so
# that error text is not counted as a device.
nGpusOutput=$("$nvSmiExec" -L) || nGpusOutput=''

# Count the non-empty lines. The previous `echo ... | wc -l` reported at least
# 1 even for empty output (echo always emits a newline), so the "no GPUs"
# exit below could never trigger.
nGpus=$(printf '%s\n' "$nGpusOutput" | grep -c .)
if [ "$nGpus" -eq 0 ]; then
  # Exit if no GPUs found; the diagnostic goes to stderr so it is not mixed
  # into the plugin's data output.
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi

# Get full output from nvidia-smi; the config and fetch code parse this text.
smiOutput=$("$nvSmiExec" -q)

# Munin "config" request: print the graph definition for the quantity selected
# by $name, followed by one label line per GPU, then exit.
# Reads: name, nGpus, nGpusOutput (one "nvidia-smi -L" line per GPU),
#        smiOutput ("nvidia-smi -q" text), optional warning / critical.
# Exits 0 on success, 1 when $name is not temp, mem or fan.
if [ "$1" = "config" ]; then

  # Print the name of GPU number $1 (0-based): line $1+1 of $nGpusOutput,
  # cut at the first "(" so any parenthesised suffix is dropped.
  gpu_name() {
    printf '%s\n' "$nGpusOutput" | sed -n "$(( $1 + 1 ))p" | cut -d '(' -f 1
  }

  # Get driver version from the query output that was already captured.
  # The previous code ran a hard-coded `nvidia-smi -q` again, which ignored
  # env.smiexec and failed when nvidia-smi was not on PATH.
  driverVersion=$(printf '%s\n' "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')

  # Configure graph depending on which quantity will be plotted
  case "$name" in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel Degrees (C)'
      echo 'graph_category gpu'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      gpuIdx=0
      while [ "$gpuIdx" -lt "$nGpus" ]; do
        echo "temp${gpuIdx}.warning ${warning:-75}"
        echo "temp${gpuIdx}.critical ${critical:-95}"
        echo "temp${gpuIdx}.info Temperature information for $(gpu_name "$gpuIdx")"
        gpuIdx=$(( gpuIdx + 1 ))
      done
      ;;
    mem)
      # First determine total memory of each GPU (e.g. "4799MiB"), one per
      # line. Only the "FB Memory Usage" section (or a bare "Memory Usage"
      # header, which the original pattern also accepted) is read; matching
      # any "...Memory Usage" header also picked up "BAR1 Memory Usage" and
      # shifted the per-GPU totals.
      gpusTotalMemOutput=$(printf '%s\n' "$smiOutput" \
        | grep -E -A 3 '^[[:space:]]*(FB )?Memory Usage' \
        | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      gpuIdx=0
      while [ "$gpuIdx" -lt "$nGpus" ]; do
        echo "mem${gpuIdx}.info Memory information for $(gpu_name "$gpuIdx")"
        gpuMem=$(printf '%s\n' "$gpusTotalMemOutput" | sed -n "$(( gpuIdx + 1 ))p")
        gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${gpuIdx}"
        gpuIdx=$(( gpuIdx + 1 ))
        # Separate the entries with ", " except after the last one.
        if [ "$gpuIdx" -lt "$nGpus" ]; then
          gpusTotalMem="${gpusTotalMem}, "
        fi
      done
      # ...then output config data.
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel Percentage'
      echo 'graph_category gpu'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      gpuIdx=0
      while [ "$gpuIdx" -lt "$nGpus" ]; do
        echo "fan${gpuIdx}.info Fan information for $(gpu_name "$gpuIdx")"
        gpuIdx=$(( gpuIdx + 1 ))
      done
      ;;
    *)
      # Diagnostics go to stderr so they are not parsed as config output.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      exit 1
      ;;
  esac

  # Common stuff for all quantities
  gpuIdx=0
  while [ "$gpuIdx" -lt "$nGpus" ]; do
    echo "${name}${gpuIdx}.label $(gpu_name "$gpuIdx")"
    gpuIdx=$(( gpuIdx + 1 ))
    #print_warning $name
    #print_critical $name
  done

  exit 0
fi

# Get requested value: parse $smiOutput ("nvidia-smi -q" text) into
# $valueGpus, one reading per line in the order the GPUs are listed.
# Exits 1 when $name is not temp, mem or fan.
case "$name" in
  temp)
    # The line following each "Temperature" header; matched case-insensitively
    # because the row prefix is "Gpu" on some driver versions and "GPU" on
    # others.
    valueGpus=$(printf '%s\n' "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    # Percentage of framebuffer memory in use per GPU (truncated integer).
    # Changes from the previous shell loop:
    #  - only the "FB Memory Usage" section (or a bare "Memory Usage" header)
    #    is read, so "BAR1 Memory Usage" rows no longer shift the GPU order;
    #  - a total or used field that is not a number (e.g. N/A), or a zero
    #    total, yields "U" (Munin's unknown value) instead of aborting the
    #    script with an arithmetic error;
    #  - no $'\n' quoting, which is not available in every /bin/sh.
    valueGpus=$(printf '%s\n' "$smiOutput" | awk -F: '
      # Numeric value of a field such as " 4799 MiB", or -1 if not a number.
      function num(s) { return (s ~ /^[ \t]*[0-9]+/) ? s + 0 : -1 }
      /^[ \t]*(FB )?Memory Usage/ { in_fb = 1; total = -1; next }
      # Any other line without a colon starts a different section.
      !/:/ { in_fb = 0; next }
      in_fb && $1 ~ /Total/ { total = num($2) }
      in_fb && $1 ~ /Used/ {
        used = num($2)
        if (total > 0 && used >= 0) print int(used * 100 / total)
        else print "U"
        in_fb = 0
      }
    ')
    ;;
  fan)
    valueGpus=$(printf '%s\n' "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  *)
    # Diagnostics go to stderr so they are not parsed as plugin values.
    echo "Can't run without a proper symlink. Exiting." >&2
    echo "Try running munin-node-configure --suggest." >&2
    exit 1
    ;;
esac

# Print requested value: one "<name><index>.value <reading>" line per GPU.
# Reads: name, nGpus, valueGpus (one reading per line, in GPU order).
# A reading that is missing or not a plain number (for example "N/A") is
# printed as "U", Munin's marker for an unknown value, instead of being passed
# through as an invalid data line.
print_gpu_values() {
  gpuIdx=0
  # ${nGpus:-0}: an unset or empty count simply prints nothing.
  while [ "$gpuIdx" -lt "${nGpus:-0}" ]; do
    value=$(printf '%s\n' "$valueGpus" | sed -n "$(( gpuIdx + 1 ))p")
    case "$value" in
      # empty, a lone "-", any character outside digits/dot/minus, or a
      # minus sign that is not in first position
      '' | - | *[!0-9.-]* | ?*-*) value=U ;;
    esac
    echo "${name}${gpuIdx}.value $value"
    gpuIdx=$(( gpuIdx + 1 ))
  done
}

print_gpu_values