1
0
Fork 0
mirror of https://github.com/munin-monitoring/contrib.git synced 2025-07-22 02:51:03 +00:00

Merge pull request #901 from Cyclenerd/nvidia_gpu

Nvidia GPU utilization
This commit is contained in:
sumpfralle 2018-02-24 14:34:12 +01:00 committed by GitHub
commit 0b07e636e2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -37,8 +37,7 @@ C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
=item * =item *
Add support for specific professional GPU features such as number of compute Add support for specific professional GPU features such as number of compute processes, clocks and so on.
processes, clocks, power draw, utilization, and so on.
=item * =item *
@ -64,7 +63,7 @@ faken@fakenmc.com
=cut =cut
# Determine name of parameter to monitor # Determine name of parameter to monitor
name=`basename $0 | sed 's/^nvidia_gpu_//g'` name=$(basename "$0" | sed 's/^nvidia_gpu_//g')
# Get location of nvidia-smi executable or use default # Get location of nvidia-smi executable or use default
nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'} nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'}
@ -72,7 +71,7 @@ nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'}
# Check if autoconf was requested # Check if autoconf was requested
if [ "$1" = "autoconf" ]; then if [ "$1" = "autoconf" ]; then
# Autoconf only returns yes if nvidia-smi exists and is executable # Autoconf only returns yes if nvidia-smi exists and is executable
if [ -x $nvSmiExec ]; then if [ -x "$nvSmiExec" ]; then
echo yes echo yes
exit 0 exit 0
else else
@ -87,81 +86,82 @@ if [ "$1" = "suggest" ]; then
echo "mem" echo "mem"
echo "fan" echo "fan"
echo "power" echo "power"
echo "utilization"
exit 0 exit 0
fi fi
# Get number of GPUs # Get number of GPUs
nGpusOutput=`$nvSmiExec -L` nGpusOutput=$("$nvSmiExec" -L)
nGpus=`echo "$nGpusOutput" | wc -l` nGpus=$(echo "$nGpusOutput" | wc -l)
if [ $nGpus -eq 0 ]; then if [ "$nGpus" -eq 0 ]; then
# Exit if no GPUs found # Exit if no GPUs found
echo "No NVIDIA GPUs detected. Exiting." echo "No NVIDIA GPUs detected. Exiting."
exit 1 exit 1
fi fi
# Get full output from nvidia-smi # Get full output from nvidia-smi
smiOutput=`$nvSmiExec -q` smiOutput=$("$nvSmiExec" -q)
# Check if config was requested # Check if config was requested
if [ "$1" = "config" ]; then if [ "$1" = "config" ]; then
# Get driver version # Get driver version
driverVersion=`nvidia-smi -q | grep "Driver Version" | cut -d : -f 2 | tr -d ' '` driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')
# Configure graph depending on which quantity will be plotted # Configure graph depending on which quantity will be plotted
case $name in case $name in
temp) temp)
echo 'graph_title GPU temperature' echo 'graph_title GPU temperature'
echo 'graph_args -l 0 -u 120' echo 'graph_args -l 0 -u 120'
echo 'graph_vlabel Degrees (C)' echo 'graph_vlabel degrees Celsius'
echo 'graph_category sensors' echo 'graph_category sensors'
echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion" echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` gpuName=$(echo "$nGpusOutput" | sed -n $((nGpusCounter+1))p | cut -d \( -f 1)
echo "temp${nGpusCounter}.warning ${warning:-75}" echo "${name}${nGpusCounter}.warning ${warning:-75}"
echo "temp${nGpusCounter}.critical ${critical:-95}" echo "${name}${nGpusCounter}.critical ${critical:-95}"
echo "temp${nGpusCounter}.info Temperature information for $gpuName" echo "${name}${nGpusCounter}.info Temperature information for $gpuName"
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
done done
;; ;;
mem) mem)
# First determine total memory of each GPU... # First determine total memory of each GPU...
gpusTotalMemOutput=`echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' '` gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
gpusTotalMem='' gpusTotalMem=''
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` gpuName=$(echo "$nGpusOutput" | sed -n $((nGpusCounter+1))p | cut -d \( -f 1)
echo "mem${nGpusCounter}.info Memory information for $gpuName" echo "${name}${nGpusCounter}.info Memory information for $gpuName"
gpuMem=`echo "$gpusTotalMemOutput"| sed -n $(( $nGpusCounter + 1 ))p` gpuMem=$(echo "$gpusTotalMemOutput"| sed -n $((nGpusCounter+1))p)
gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}" gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}"
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
if [ $nGpusCounter -lt $nGpus ]; then if [ "$nGpusCounter" -lt "$nGpus" ]; then
gpusTotalMem="${gpusTotalMem}, " gpusTotalMem="${gpusTotalMem}, "
fi fi
done done
# ...then output config data. # ...then output config data.
echo 'graph_title GPU memory usage' echo 'graph_title GPU memory usage'
echo 'graph_args -l 0 -u 100' echo 'graph_args -l 0 -u 100'
echo 'graph_vlabel Percentage' echo 'graph_vlabel %'
echo 'graph_category memory' echo 'graph_category memory'
echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)" echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
;; ;;
fan) fan)
echo 'graph_title GPU fan speed' echo 'graph_title GPU fan speed'
echo 'graph_args -l 0 -u 100' echo 'graph_args -l 0 -u 100'
echo 'graph_vlabel Percentage' echo 'graph_vlabel %'
echo 'graph_category sensors' echo 'graph_category sensors'
echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion" echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` gpuName=$(echo "$nGpusOutput" | sed -n $((nGpusCounter+1))p | cut -d \( -f 1)
echo "fan${nGpusCounter}.info Fan information for $gpuName" echo "${name}${nGpusCounter}.info Fan information for $gpuName"
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
done done
;; ;;
power) power)
echo 'graph_title GPU power consumption' echo 'graph_title GPU power consumption'
@ -169,13 +169,27 @@ if [ "$1" = "config" ]; then
echo 'graph_category sensors' echo 'graph_category sensors'
echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion" echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` gpuName=$(echo "$nGpusOutput" | sed -n $((nGpusCounter+1))p | cut -d \( -f 1)
echo "power${nGpusCounter}.info power consumption of $gpuName" echo "${name}${nGpusCounter}.info power consumption of $gpuName"
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
done done
;; ;;
utilization)
echo 'graph_title GPU utilization'
echo 'graph_args -l 0 -u 100'
echo 'graph_vlabel %'
echo 'graph_category system'
echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
nGpusCounter=0
while [ $nGpusCounter -lt "$nGpus" ]
do
gpuName=$(echo "$nGpusOutput" | sed -n $((nGpusCounter+1))p | cut -d \( -f 1)
echo "${name}${nGpusCounter}.info GPU utilization information for $gpuName"
: $((nGpusCounter=nGpusCounter+1))
done
;;
*) *)
echo "Can't run without a proper symlink. Exiting." echo "Can't run without a proper symlink. Exiting."
echo "Try running munin-node-configure --suggest." echo "Try running munin-node-configure --suggest."
@ -185,11 +199,11 @@ if [ "$1" = "config" ]; then
# Common stuff for all quantities # Common stuff for all quantities
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` gpuName=$(echo "$nGpusOutput" | sed -n $((nGpusCounter+1))p | cut -d \( -f 1)
echo "${name}${nGpusCounter}.label $gpuName" echo "${name}${nGpusCounter}.label $gpuName"
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
#print_warning $name #print_warning $name
#print_critical $name #print_critical $name
done done
@ -200,27 +214,30 @@ fi
# Get requested value # Get requested value
case $name in case $name in
temp) temp)
valueGpus=`echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2` valueGpus=$(echo "$smiOutput" | grep -A 1 "Temperature" | grep -i "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2)
;; ;;
mem) mem)
totalMemGpus=`echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2` totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
usedMemGpus=`echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2` usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
valueGpus='' valueGpus=''
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
totalMemGpu=`echo "$totalMemGpus" | sed -n $(( $nGpusCounter + 1 ))p` totalMemGpu=$(echo "$totalMemGpus" | sed -n $((nGpusCounter+1))p)
usedMemGpu=`echo "$usedMemGpus" | sed -n $(( $nGpusCounter + 1 ))p` usedMemGpu=$(echo "$usedMemGpus" | sed -n $((nGpusCounter+1))p)
percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu )) percentMemUsed=$((usedMemGpu*100/totalMemGpu))
valueGpus="${valueGpus}${percentMemUsed}"$'\n' valueGpus="${valueGpus}${percentMemUsed}"$'\n'
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
done done
;; ;;
fan) fan)
valueGpus=`echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2` valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
;; ;;
power) power)
valueGpus=`echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2` valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
;;
utilization)
valueGpus=$(echo "$smiOutput" | grep "Gpu" | cut -d ':' -f 2 | cut -d ' ' -f 2)
;; ;;
*) *)
echo "Can't run without a proper symlink. Exiting." echo "Can't run without a proper symlink. Exiting."
@ -232,12 +249,9 @@ case $name in
# Print requested value # Print requested value
nGpusCounter=0 nGpusCounter=0
while [ $nGpusCounter -lt $nGpus ] while [ $nGpusCounter -lt "$nGpus" ]
do do
value=`echo "$valueGpus" | sed -n $(( $nGpusCounter + 1 ))p` value=$(echo "$valueGpus" | sed -n $((nGpusCounter+1))p)
echo "${name}${nGpusCounter}.value $value" echo "${name}${nGpusCounter}.value $value"
: $(( nGpusCounter = $nGpusCounter + 1 )) : $((nGpusCounter=nGpusCounter+1))
done done