From 426bba44662c79b56811863c57a9a16aa597a8ed Mon Sep 17 00:00:00 2001 From: Nuno Fachada Date: Tue, 11 Dec 2012 19:57:06 +0000 Subject: [PATCH] Add plugins for monitoring NVIDIA and AMD GPUs --- plugins/gpu/amd_gpu_ | 235 ++++++++++++++++++++++++++++++++++++++++ plugins/gpu/nvidia_gpu_ | 208 +++++++++++++++++++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100755 plugins/gpu/amd_gpu_ create mode 100755 plugins/gpu/nvidia_gpu_ diff --git a/plugins/gpu/amd_gpu_ b/plugins/gpu/amd_gpu_ new file mode 100755 index 00000000..bd4c8af1 --- /dev/null +++ b/plugins/gpu/amd_gpu_ @@ -0,0 +1,235 @@ +#!/bin/bash +# -*- bash -*- + +: << =cut + +=head1 NAME + +amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility, +usually bundled with AMD GPU driver, to obtain information. To use this +plugin you have to make sure aticonfig will run without an active X +server (i.e. without anyone being logged in via the GUI). For more +information on this visit this link: +http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/ + +=head1 CONFIGURATION + +This is a wildcard plugin. The wildcard prefix link name should be the +value to monitor. + +This plugin uses the following configuration variables: + + [amd_gpu_*] + env.aticonfexec - Location of aticonfig executable. + user root + +=head2 DEFAULT CONFIGURATION + +The default configuration is to set "env.aticonfexec" to /usr/bin/aticonfig. + +=head2 EXAMPLE WILDCARD USAGE + +C + +...will monitor the temperature of available AMD GPUs. + +=head1 AUTHOR + +Nuno Fachada +faken@fakenmc.com + +=head1 LICENSE + + GNU General Public License, version 2 + http://www.gnu.org/licenses/gpl-2.0.html + +=head1 MAGIC MARKERS + + #%# family=auto + #%# capabilities=autoconf suggest + +=cut + +# Determine name of parameter to monitor +name=`basename $0 | sed 's/^amd_gpu_//g'` + +# Get location of aticonfig executable or use default +atiConfigExec=${aticonfexec:-'/usr/bin/aticonfig'} + +# Check if autoconf was requested +if [ "$1" = "autoconf" ]; then + # Autoconf only returns yes if aticonfig exists and is executable + if [ -x $atiConfigExec ]; then + echo yes + exit 0 + else + echo "no (aticonfig executable not found)" + exit 0 + fi +fi + +# Check if suggest was requested +if [ "$1" = "suggest" ]; then + echo "temp" + echo "clocks" + echo "fan" + echo "load" + echo "vcore" + exit 0 +fi + +# Get number of GPUs +nGpusOutput=`$atiConfigExec --list-adapters` + +nGpus=`echo "$nGpusOutput" | wc -l` +nGpus=$((nGpus - 2)) # Last two lines don't matter +if [ $nGpus -eq 0 ]; then + # Exit if no GPUs found + echo "No AMD GPUs detected. Exiting." + exit 1 +fi + +# Check if config was requested +if [ "$1" = "config" ]; then + + # Configure graph depending on what which quantity will be plotted + case $name in + temp) + echo 'graph_title GPU temperature' + echo 'graph_args -l 0 -u 120' + echo 'graph_vlabel Degrees (C)' + echo 'graph_category gpu' + echo "graph_info Temperature information for AMD GPUs" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` + echo "temp${nGpusCounter}.warning 75" + echo "temp${nGpusCounter}.critical 95" + echo "temp${nGpusCounter}.info Temperature information for $gpuName" + echo "temp${nGpusCounter}.label Temperature ($gpuName)" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + clocks) + # First determine max clock for each GPU... + read -a array <<< `$atiConfigExec --odgc | grep "Peak Range" | grep -o "[0-9]*"` + maxclock=0 + for element in "${array[@]}" + do + if [ "$element" -gt "$maxclock" ]; then + maxclock=$element + fi + done + # ...then output config data. + echo 'graph_title GPU clock' + echo "graph_args -l 0 -u $maxclock" + echo 'graph_vlabel MHz' + echo 'graph_category gpu' + echo "graph_info Core and memory clock info for AMD GPUs" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` + echo "memclock${nGpusCounter}.info Memory clock information for $gpuName" + echo "memclock${nGpusCounter}.label Memory clock ($gpuName)" + echo "coreclock${nGpusCounter}.info Core clock information for $gpuName" + echo "coreclock${nGpusCounter}.label Core clock ($gpuName)" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + fan) + echo 'graph_title GPU fan speed' + echo 'graph_args -l 0 -u 100' + echo 'graph_vlabel Percentage' + echo 'graph_category gpu' + echo "graph_info Fan speed of AMD GPUs" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` + echo "fan${nGpusCounter}.info Fan speed information for $gpuName" + echo "fan${nGpusCounter}.label Fan speed ($gpuName)" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + load) + echo 'graph_title GPU load' + echo 'graph_args -l 0 -u 100' + echo 'graph_vlabel Percentage' + echo 'graph_category gpu' + echo "graph_info GPU load" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` + echo "load${nGpusCounter}.info Load information for $gpuName" + echo "load${nGpusCounter}.label Load ($gpuName)" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + vcore) + echo 'graph_title GPU core voltage' + echo 'graph_vlabel mV' + echo 'graph_category gpu' + echo "graph_info GPU core voltage" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` + echo "vcore${nGpusCounter}.info Vcore information for $gpuName" + echo "vcore${nGpusCounter}.label Core voltage ($gpuName)" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + *) + echo "Can't run without a proper symlink. Exiting." + echo "Try running munin-node-configure --suggest." + exit 1 + ;; + esac + + exit 0 +fi + +# Get and print requested value for all available GPUs +export DISPLAY=:0 +nGpusCounter=0 +while [ $nGpusCounter -lt $nGpus ] +do + case $name in + temp) + value=`$atiConfigExec --adapter=$nGpusCounter --odgt | grep "Sensor 0: Temperature" | grep -o "[0-9]*\.[0-9]*"` + echo "temp${nGpusCounter}.value $value" + ;; + clocks) + value=`$atiConfigExec --adapter=$nGpusCounter --odgc | grep "Current Clocks" | grep -o "[0-9]*"` + coreClock=`echo "$value" | sed -n 1p` + echo "coreclock${nGpusCounter}.value $coreClock" + memClock=`echo "$value" | sed -n 2p` + echo "memclock${nGpusCounter}.value $memClock" + ;; + fan) + value=`$atiConfigExec --adapter=$nGpusCounter --pplib-cmd "get fanspeed 0" | grep "Fan Speed" | grep -o "[0-9]*"` + echo "fan${nGpusCounter}.value $value" + ;; + load) + value=`$atiConfigExec --adapter=$nGpusCounter --odgc | grep "GPU load" | grep -o "[0-9]*"` + echo "load${nGpusCounter}.value $value" + ;; + vcore) + value=`$atiConfigExec --adapter=$nGpusCounter --pplib-cmd "get activity" | grep "VDDC" | grep -o "[0-9]*"` + echo "vcore${nGpusCounter}.value $value" + ;; + *) + echo "Can't run without a proper symlink. Exiting." + echo "Try running munin-node-configure --suggest." + exit 1 + ;; + esac + : $(( nGpusCounter = $nGpusCounter + 1 )) +done + + + + diff --git a/plugins/gpu/nvidia_gpu_ b/plugins/gpu/nvidia_gpu_ new file mode 100755 index 00000000..a43f5989 --- /dev/null +++ b/plugins/gpu/nvidia_gpu_ @@ -0,0 +1,208 @@ +#!/bin/sh +# -*- sh -*- + +: << =cut + +=head1 NAME + +nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility, +usually bundled with NVIDIA GPU driver, to obtain information. + +=head1 CONFIGURATION + +This is a wildcard plugin. The wildcard prefix link name should be the +value to monitor. + +This plugin uses the following configuration variables: + + [nvidia_gpu_*] + env.smiexec - Location of nvidia-smi executable. + +=head2 DEFAULT CONFIGURATION + +The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi. + +=head2 EXAMPLE WILDCARD USAGE + +C + +...will monitor the temperature of available GPUs. + +=head1 AUTHOR + +Nuno Fachada +faken@fakenmc.com + +=head1 LICENSE + + GNU General Public License, version 2 + http://www.gnu.org/licenses/gpl-2.0.html + +=head1 MAGIC MARKERS + + #%# family=auto + #%# capabilities=autoconf suggest + +=cut + +# Determine name of parameter to monitor +name=`basename $0 | sed 's/^nvidia_gpu_//g'` + +# Get location of nvidia-smi executable or use default +nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'} + +# Check if autoconf was requested +if [ "$1" = "autoconf" ]; then + # Autoconf only returns yes if nvidia-smi exists and is executable + if [ -x $nvSmiExec ]; then + echo yes + exit 0 + else + echo "no (nvidia-smi executable not found)" + exit 0 + fi +fi + +# Check if suggest was requested +if [ "$1" = "suggest" ]; then + echo "temp" + echo "mem" + echo "fan" + exit 0 +fi + +# Get number of GPUs +nGpusOutput=`$nvSmiExec -L` +nGpus=`echo "$nGpusOutput" | wc -l` +if [ $nGpus -eq 0 ]; then + # Exit if no GPUs found + echo "No NVIDIA GPUs detected. Exiting." + exit 1 +fi + +# Get full output from nvidia-smi +smiOutput=`$nvSmiExec -q` + +# Check if config was requested +if [ "$1" = "config" ]; then + + # Get driver version + driverVersion=`nvidia-smi -q | grep "Driver Version" | cut -d : -f 2 | tr -d ' '` + + # Configure graph depending on what which quantity will be plotted + case $name in + temp) + echo 'graph_title GPU temperature' + echo 'graph_args -l 0 -u 120' + echo 'graph_vlabel Degrees (C)' + echo 'graph_category gpu' + echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` + echo "temp${nGpusCounter}.warning 75" + echo "temp${nGpusCounter}.critical 95" + echo "temp${nGpusCounter}.info Temperature information for $gpuName" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + mem) + # First determine total memory of each GPU... + gpusTotalMemOutput=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' '` + gpusTotalMem='' + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` + echo "mem${nGpusCounter}.info Memory information for $gpuName" + gpuMem=`echo "$gpusTotalMemOutput"| sed -n $(( $nGpusCounter + 1 ))p` + gpusTotalMem="${gpusTotalMem}${gpuMem} for GPU ${nGpusCounter}" + : $(( nGpusCounter = $nGpusCounter + 1 )) + if [ $nGpusCounter -lt $nGpus ]; then + gpusTotalMem="${gpusTotalMem}, " + fi + done + # ...then output config data. + echo 'graph_title GPU memory usage' + echo 'graph_args -l 0 -u 100' + echo 'graph_vlabel Percentage' + echo 'graph_category gpu' + echo "graph_info Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)" + ;; + fan) + echo 'graph_title GPU fan speed' + echo 'graph_args -l 0 -u 100' + echo 'graph_vlabel Percentage' + echo 'graph_category gpu' + echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion" + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` + echo "fan${nGpusCounter}.info Fan information for $gpuName" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + *) + echo "Can't run without a proper symlink. Exiting." + echo "Try running munin-node-configure --suggest." + exit 1 + ;; + esac + + # Common stuff for all quantities + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` + echo "${name}${nGpusCounter}.label $gpuName" + : $(( nGpusCounter = $nGpusCounter + 1 )) + #print_warning $name + #print_critical $name + done + + exit 0 +fi + +# Get requested value +case $name in + temp) + valueGpus=`echo "$smiOutput" | grep -A 1 "Temperature" | grep "Gpu" | cut -d : -f 2 | cut -d ' ' -f 2` + ;; + mem) + totalMemGpus=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2` + usedMemGpus=`echo "$smiOutput" | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2` + valueGpus='' + nGpusCounter=0 + while [ $nGpusCounter -lt $nGpus ] + do + totalMemGpu=`echo "$totalMemGpus" | sed -n $(( $nGpusCounter + 1 ))p` + usedMemGpu=`echo "$usedMemGpus" | sed -n $(( $nGpusCounter + 1 ))p` + percentMemUsed=$(( $usedMemGpu * 100 / $totalMemGpu )) + valueGpus="${valueGpus}${percentMemUsed}\n" + : $(( nGpusCounter = $nGpusCounter + 1 )) + done + ;; + fan) + valueGpus=`echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2` + ;; + *) + echo "Can't run without a proper symlink. Exiting." + echo "Try running munin-node-configure --suggest." + exit 1 + ;; + esac + + +# Print requested value +nGpusCounter=0 +while [ $nGpusCounter -lt $nGpus ] +do + value=`echo "$valueGpus" | sed -n $(( $nGpusCounter + 1 ))p` + echo "${name}${nGpusCounter}.value $value" + : $(( nGpusCounter = $nGpusCounter + 1 )) +done + + +