From 758ca724a0fd83269cabb7020e8824f68a32276e Mon Sep 17 00:00:00 2001 From: Nuno Fachada Date: Sat, 29 Dec 2012 01:58:31 +0000 Subject: [PATCH 1/3] Add TODO and FIXME notes on some issues TODO 1: Follow multigraph suggestion from Flameeyes to look into multigraph plugins (http://munin-monitoring.org/wiki/MultigraphSampleOutput), in order to reduce the amount of round trips to get the data. TODO 2: Put warning and critical as vars in config with sensible defaults. TODO 3: Add additional output options for nvidia-smi only available for professional GPUs. FIXME: Possible bug in lines 87-91 of amd_gpu_ --- plugins/gpu/amd_gpu_ | 4 ++++ plugins/gpu/nvidia_gpu_ | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/plugins/gpu/amd_gpu_ b/plugins/gpu/amd_gpu_ index bd4c8af1..2945cbd9 100755 --- a/plugins/gpu/amd_gpu_ +++ b/plugins/gpu/amd_gpu_ @@ -83,6 +83,7 @@ nGpusOutput=`$atiConfigExec --list-adapters` nGpus=`echo "$nGpusOutput" | wc -l` nGpus=$((nGpus - 2)) # Last two lines don't matter +# FIXME Possible bug in code bellow: maybe should be <= 0 instead of == 0? if [ $nGpus -eq 0 ]; then # Exit if no GPUs found echo "No AMD GPUs detected. Exiting." @@ -230,6 +231,9 @@ do : $(( nGpusCounter = $nGpusCounter + 1 )) done +# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. +# TODO Put warning and critical as vars in config with sensible defaults + diff --git a/plugins/gpu/nvidia_gpu_ b/plugins/gpu/nvidia_gpu_ index a43f5989..df4db473 100755 --- a/plugins/gpu/nvidia_gpu_ +++ b/plugins/gpu/nvidia_gpu_ @@ -204,5 +204,11 @@ do : $(( nGpusCounter = $nGpusCounter + 1 )) done +# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. 
+# TODO Put warning and critical as vars in config with sensible defaults + +# TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not) + + From 10b1de81bbd2c92e9fb9e897e38231ff8903729a Mon Sep 17 00:00:00 2001 From: Nuno Fachada Date: Tue, 12 Nov 2013 11:29:12 +0000 Subject: [PATCH 2/3] Configurable warning and critical temperatures for GPUs --- plugins/gpu/amd_gpu_ | 11 ++++++----- plugins/gpu/nvidia_gpu_ | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/plugins/gpu/amd_gpu_ b/plugins/gpu/amd_gpu_ index 2945cbd9..ddfb812f 100755 --- a/plugins/gpu/amd_gpu_ +++ b/plugins/gpu/amd_gpu_ @@ -9,7 +9,7 @@ amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility, usually bundled with AMD GPU driver, to obtain information. To use this plugin you have to make sure aticonfig will run without an active X server (i.e. without anyone being logged in via the GUI). For more -information on this visit this link: +information about this issue visit the link below: http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/ =head1 CONFIGURATION @@ -20,8 +20,10 @@ value to monitor. This plugin uses the following configuration variables: [amd_gpu_*] - env.aticonfexec - Location of aticonfig executable. user root + env.aticonfexec - Location of aticonfig executable. 
+ env.warning - Warning temperature + env.critical - Critical temperature =head2 DEFAULT CONFIGURATION @@ -105,8 +107,8 @@ if [ "$1" = "config" ]; then while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "` - echo "temp${nGpusCounter}.warning 75" - echo "temp${nGpusCounter}.critical 95" + echo "temp${nGpusCounter}.warning ${warning:-75}" + echo "temp${nGpusCounter}.critical ${critical:-95}" echo "temp${nGpusCounter}.info Temperature information for $gpuName" echo "temp${nGpusCounter}.label Temperature ($gpuName)" : $(( nGpusCounter = $nGpusCounter + 1 )) @@ -232,7 +234,6 @@ do done # TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. -# TODO Put warning and critical as vars in config with sensible defaults diff --git a/plugins/gpu/nvidia_gpu_ b/plugins/gpu/nvidia_gpu_ index df4db473..f372fb7c 100755 --- a/plugins/gpu/nvidia_gpu_ +++ b/plugins/gpu/nvidia_gpu_ @@ -17,6 +17,8 @@ This plugin uses the following configuration variables: [nvidia_gpu_*] env.smiexec - Location of nvidia-smi executable. 
+ env.warning - Warning temperature + env.critical - Critical temperature =head2 DEFAULT CONFIGURATION @@ -101,8 +103,8 @@ if [ "$1" = "config" ]; then while [ $nGpusCounter -lt $nGpus ] do gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1` - echo "temp${nGpusCounter}.warning 75" - echo "temp${nGpusCounter}.critical 95" + echo "temp${nGpusCounter}.warning ${warning:-75}" + echo "temp${nGpusCounter}.critical ${critical:-95}" echo "temp${nGpusCounter}.info Temperature information for $gpuName" : $(( nGpusCounter = $nGpusCounter + 1 )) done @@ -205,8 +207,6 @@ do done # TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. -# TODO Put warning and critical as vars in config with sensible defaults - # TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not) From c53197ce5a01eec368ff56c553456dcd6f57ab87 Mon Sep 17 00:00:00 2001 From: Nuno Fachada Date: Tue, 12 Nov 2013 13:47:45 +0000 Subject: [PATCH 3/3] Improve GPU plugins documentation --- plugins/gpu/amd_gpu_ | 18 ++++++++++++------ plugins/gpu/nvidia_gpu_ | 22 +++++++++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/plugins/gpu/amd_gpu_ b/plugins/gpu/amd_gpu_ index ddfb812f..4f7095e0 100755 --- a/plugins/gpu/amd_gpu_ +++ b/plugins/gpu/amd_gpu_ @@ -27,7 +27,8 @@ This plugin uses the following configuration variables: =head2 DEFAULT CONFIGURATION -The default configuration is to set "env.aticonfexec" to /usr/bin/aticonfig. +The default configuration is to set "env.aticonfexec" to /usr/bin/aticonfig and +assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively. 
=head2 EXAMPLE WILDCARD USAGE @@ -35,6 +36,16 @@ C ...will monitor the temperature of available AMD GPUs. +=head1 TODO + +=over 4 + +=item * + +Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). + +=back + =head1 AUTHOR Nuno Fachada @@ -85,7 +96,6 @@ nGpusOutput=`$atiConfigExec --list-adapters` nGpus=`echo "$nGpusOutput" | wc -l` nGpus=$((nGpus - 2)) # Last two lines don't matter -# FIXME Possible bug in code bellow: maybe should be <= 0 instead of == 0? if [ $nGpus -eq 0 ]; then # Exit if no GPUs found echo "No AMD GPUs detected. Exiting." @@ -233,8 +243,4 @@ do : $(( nGpusCounter = $nGpusCounter + 1 )) done -# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. - - - diff --git a/plugins/gpu/nvidia_gpu_ b/plugins/gpu/nvidia_gpu_ index f372fb7c..38d67377 100755 --- a/plugins/gpu/nvidia_gpu_ +++ b/plugins/gpu/nvidia_gpu_ @@ -22,7 +22,8 @@ This plugin uses the following configuration variables: =head2 DEFAULT CONFIGURATION -The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi. +The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and +assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively. =head2 EXAMPLE WILDCARD USAGE @@ -30,6 +31,21 @@ C ...will monitor the temperature of available GPUs. +=head1 TODO + +=over 4 + +=item * + +Add support for specific professional GPU features such as number of compute +processes, clocks, power draw, utilization, and so on. + +=item * + +Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput). 
+ +=back + =head1 AUTHOR Nuno Fachada @@ -206,9 +222,5 @@ do : $(( nGpusCounter = $nGpusCounter + 1 )) done -# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data. -# TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not) - -