From 10b1de81bbd2c92e9fb9e897e38231ff8903729a Mon Sep 17 00:00:00 2001 From: Nuno Fachada Date: Tue, 12 Nov 2013 11:29:12 +0000 Subject: [PATCH] Configurable warning and critical temperatures for GPUs --- plugins/gpu/amd_gpu_ | 11 ++++++----- plugins/gpu/nvidia_gpu_ | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/plugins/gpu/amd_gpu_ b/plugins/gpu/amd_gpu_ index 2945cbd9..ddfb812f 100755 --- a/plugins/gpu/amd_gpu_ +++ b/plugins/gpu/amd_gpu_ @@ -9,7 +9,7 @@ amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility, usually bundled with AMD GPU driver, to obtain information. To use this plugin you have to make sure aticonfig will run without an active X server (i.e. without anyone being logged in via the GUI). For more -information on this visit this link: +information about this issue visit the link below: http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/ =head1 CONFIGURATION @@ -20,8 +20,10 @@ value to monitor. This plugin uses the following configuration variables: [amd_gpu_*] - env.aticonfexec - Location of aticonfig executable. user root + env.aticonfexec - Location of aticonfig executable. + env.warning - Warning temperature + env.critical - Critical temperature =head2 DEFAULT CONFIGURATION @@ -105,8 +107,8 @@ if [ "$1" = "config" ]; then while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` - echo "temp${nGpusCounter}.warning 75" - echo "temp${nGpusCounter}.critical 95" + echo "temp${nGpusCounter}.warning ${warning:-75}" + echo "temp${nGpusCounter}.critical ${critical:-95}" echo "temp${nGpusCounter}.info Temperature information for $gpuName" echo "temp${nGpusCounter}.label Temperature ($gpuName)" : $(( nGpusCounter = $nGpusCounter + 1 )) @@ -232,7 +234,6 @@ do done # TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. -# TODO Put warning and critical as vars in config with sensible defaults diff --git a/plugins/gpu/nvidia_gpu_ b/plugins/gpu/nvidia_gpu_ index df4db473..f372fb7c 100755 --- a/plugins/gpu/nvidia_gpu_ +++ b/plugins/gpu/nvidia_gpu_ @@ -17,6 +17,8 @@ This plugin uses the following configuration variables: [nvidia_gpu_*] env.smiexec - Location of nvidia-smi executable. + env.warning - Warning temperature + env.critical - Critical temperature =head2 DEFAULT CONFIGURATION @@ -101,8 +103,8 @@ if [ "$1" = "config" ]; then while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` - echo "temp${nGpusCounter}.warning 75" - echo "temp${nGpusCounter}.critical 95" + echo "temp${nGpusCounter}.warning ${warning:-75}" + echo "temp${nGpusCounter}.critical ${critical:-95}" echo "temp${nGpusCounter}.info Temperature information for $gpuName" : $(( nGpusCounter = $nGpusCounter + 1 )) done @@ -205,8 +207,6 @@ do done # TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. -# TODO Put warning and critical as vars in config with sensible defaults - # TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not)