mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-22 02:51:03 +00:00
Configurable warning and critical temperatures for GPUs
This commit is contained in:
parent
758ca724a0
commit
10b1de81bb
2 changed files with 10 additions and 9 deletions
|
@ -9,7 +9,7 @@ amd_gpu_ - Wildcard plugin to monitor AMD GPUs. Uses aticonfig utility,
|
||||||
usually bundled with AMD GPU driver, to obtain information. To use this
|
usually bundled with AMD GPU driver, to obtain information. To use this
|
||||||
plugin you have to make sure aticonfig will run without an active X
|
plugin you have to make sure aticonfig will run without an active X
|
||||||
server (i.e. without anyone being logged in via the GUI). For more
|
server (i.e. without anyone being logged in via the GUI). For more
|
||||||
information on this visit this link:
|
information about this issue visit the link below:
|
||||||
http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/
|
http://www.mayankdaga.com/running-opencl-applications-remotely-on-amd-gpus/
|
||||||
|
|
||||||
=head1 CONFIGURATION
|
=head1 CONFIGURATION
|
||||||
|
@ -20,8 +20,10 @@ value to monitor.
|
||||||
This plugin uses the following configuration variables:
|
This plugin uses the following configuration variables:
|
||||||
|
|
||||||
[amd_gpu_*]
|
[amd_gpu_*]
|
||||||
env.aticonfexec - Location of aticonfig executable.
|
|
||||||
user root
|
user root
|
||||||
|
env.aticonfexec - Location of aticonfig executable.
|
||||||
|
env.warning - Warning temperature threshold (default: 75)
|
||||||
|
env.critical - Critical temperature threshold (default: 95)
|
||||||
|
|
||||||
=head2 DEFAULT CONFIGURATION
|
=head2 DEFAULT CONFIGURATION
|
||||||
|
|
||||||
|
@ -105,8 +107,8 @@ if [ "$1" = "config" ]; then
|
||||||
while [ $nGpusCounter -lt $nGpus ]
|
while [ $nGpusCounter -lt $nGpus ]
|
||||||
do
|
do
|
||||||
gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "`
|
gpuName=`echo "$nGpusOutput" | grep "* 0" | cut -f 1,3 --complement -d " "`
|
||||||
echo "temp${nGpusCounter}.warning 75"
|
echo "temp${nGpusCounter}.warning ${warning:-75}"
|
||||||
echo "temp${nGpusCounter}.critical 95"
|
echo "temp${nGpusCounter}.critical ${critical:-95}"
|
||||||
echo "temp${nGpusCounter}.info Temperature information for $gpuName"
|
echo "temp${nGpusCounter}.info Temperature information for $gpuName"
|
||||||
echo "temp${nGpusCounter}.label Temperature ($gpuName)"
|
echo "temp${nGpusCounter}.label Temperature ($gpuName)"
|
||||||
: $(( nGpusCounter = $nGpusCounter + 1 ))
|
: $(( nGpusCounter = $nGpusCounter + 1 ))
|
||||||
|
@ -232,7 +234,6 @@ do
|
||||||
done
|
done
|
||||||
|
|
||||||
# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data.
|
# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data.
|
||||||
# TODO Put warning and critical as vars in config with sensible defaults
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,8 @@ This plugin uses the following configuration variables:
|
||||||
|
|
||||||
[nvidia_gpu_*]
|
[nvidia_gpu_*]
|
||||||
env.smiexec - Location of nvidia-smi executable.
|
env.smiexec - Location of nvidia-smi executable.
|
||||||
|
env.warning - Warning temperature threshold (default: 75)
|
||||||
|
env.critical - Critical temperature threshold (default: 95)
|
||||||
|
|
||||||
=head2 DEFAULT CONFIGURATION
|
=head2 DEFAULT CONFIGURATION
|
||||||
|
|
||||||
|
@ -101,8 +103,8 @@ if [ "$1" = "config" ]; then
|
||||||
while [ $nGpusCounter -lt $nGpus ]
|
while [ $nGpusCounter -lt $nGpus ]
|
||||||
do
|
do
|
||||||
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
|
gpuName=`echo "$nGpusOutput" | sed -n $(( $nGpusCounter + 1 ))p | cut -d \( -f 1`
|
||||||
echo "temp${nGpusCounter}.warning 75"
|
echo "temp${nGpusCounter}.warning ${warning:-75}"
|
||||||
echo "temp${nGpusCounter}.critical 95"
|
echo "temp${nGpusCounter}.critical ${critical:-95}"
|
||||||
echo "temp${nGpusCounter}.info Temperature information for $gpuName"
|
echo "temp${nGpusCounter}.info Temperature information for $gpuName"
|
||||||
: $(( nGpusCounter = $nGpusCounter + 1 ))
|
: $(( nGpusCounter = $nGpusCounter + 1 ))
|
||||||
done
|
done
|
||||||
|
@ -205,8 +207,6 @@ do
|
||||||
done
|
done
|
||||||
|
|
||||||
# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data.
|
# TODO Follow multigraph suggestion from Flameeyes to look into multigraph plugins http://munin-monitoring.org/wiki/MultigraphSampleOutput, in order to reduce the amount of round trips to get the data.
|
||||||
# TODO Put warning and critical as vars in config with sensible defaults
|
|
||||||
|
|
||||||
# TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not)
|
# TODO Nvidia only: Add unsupported output options from nvidia-smi for those who have that option (how to test?). Test if they are supported and put them in suggest (or not) in case they are supported (or not)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue