1
0
Fork 0
mirror of https://github.com/munin-monitoring/contrib.git synced 2025-07-21 18:41:03 +00:00
Munin-Contrib/plugins/gpu/nvidia_gpu_by_user
2018-10-01 11:23:36 +09:00

253 lines
5.5 KiB
Bash
Executable file

#!/bin/sh
# -*- sh -*-
: <<EOF #
=cut
=head1 NAME
nvidia_gpu_by_user - Plugin to monitor GPU memory usage by user.
=head1 CONFIGURATION
Add this to node configuration file:
[nvidia_gpu_by_user]
env.smiexec - Location of nvidia-smi executable.
env.gpuusers - List of usernames to monitor (space separated).
=head1 USAGE
If env.gpuusers is set, the graph always shows the listed users
(root, user1, user2 in the example below), whether or not they are using a GPU.
Otherwise, the graph shows only the users who are using a GPU right now.
Example:
[nvidia_gpu_by_user]
env.smiexec /path/to/nvidia-smi
env.gpuusers root user1 user2
=head1 AUTHOR
Hideki Takano
ymhtakano@gmail.com
=head1 LICENSE
GPLv2
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf
=cut
EOF
# Source plugin.sh from the Munin library directory.
. "$MUNIN_LIBDIR/plugins/plugin.sh"

# nvidia-smi binary: env.smiexec when configured, otherwise the default path.
nvSmiExec=${smiexec:-'/usr/bin/nvidia-smi'}

# Space separated user list from env.gpuusers (empty when not configured).
gpuUSERS=${gpuusers:-""}

# Autoconf probe: answer "yes" only when nvidia-smi exists and is executable.
# Either way the probe itself ends with exit status 0.
case "$1" in
  autoconf)
    if [ ! -x "$nvSmiExec" ]; then
      echo "no (nvidia-smi executable not found)"
    else
      echo yes
    fi
    exit 0
    ;;
esac
# GPU usage
#
# The report is produced by a two stage pipeline.  Each stage lives in its own
# function so that it can be fed canned "nvidia-smi -q" output.

# Stage 1: reduce an "nvidia-smi -q" report (stdin) to the lines the awk stage
# consumes: product name, GPU UUID, the FB memory "Total" line and, for each
# process, its "Process ID" and "Used GPU Memory" lines.
extract_smi_fields() {
  grep -A 3 -E "(Product Name|GPU UUID|Process ID|FB Memory Usage)" |
    grep -E "(Product Name|GPU UUID|Process ID|Total|Used GPU Memory)"
}

# Stage 2: turn the filtered report (stdin) into Munin multigraph output.
#   $1 - plugin argument; "config" prints the graph definitions, anything
#        else prints the current values
#   $2 - space separated list of users that must always be listed
#   $3 - 1 to emit graph_order lines, 0 to omit them
# Exit status is 1 (with a message on stderr) when no GPU was found.
render_gpu_by_user() {
  sed -e 's/^ *//' -e 's/: */:/g' |
    awk -F':' -v arg="$1" -v gpuUsers="$2" -v order="$3" '
# Data model (all kept in the gpu array, n is the index of the last GPU seen):
#   gpu["name", g], gpu["id", g], gpu["total", g]  per GPU attributes
#   gpu["user", g, k], gpu["used", g, k]           k-th user of GPU g and the
#                                                  memory summed over that
#                                                  users processes

BEGIN {
  n = -1
  split("", gpu)
  stderr = "/dev/stderr"
  ownersLoaded = 0
}

# A product name line opens the section of the next GPU.
/^Product Name/ {
  n++
  slot = 0
  gpu["name", n] = $2
}

/^GPU UUID/ {
  gpu["id", n] = $2
}

# Value looks like "8000 MiB"; keep the number only.
/^Total/ {
  split($2, tmp, " ")
  gpu["total", n] = tmp[1]
}

# Map the process to its owner and select (or create) that users slot.
/^Process ID/ {
  split($2, tmp, " ")
  procUser = lookupUser(tmp[1])
  slot = getUserIdxInGpu(n, procUser)
  gpu["user", n, slot] = procUser
}

# Add the memory of the process announced by the preceding Process ID line.
/^Used GPU Memory/ {
  split($2, tmp, " ")
  gpu["used", n, slot] += tmp[1]
}

END {
  if (n < 0) {
    print "No NVIDIA GPUs detected. Exiting." > stderr
    exit 1
  }

  # Make sure every configured user has a (possibly zero) entry on every GPU
  # so that the graph keeps showing them while they are idle.  Configured
  # names get the same sanitizing as names read from ps, otherwise they would
  # neither match real usage nor be valid field names.
  userCount = split(gpuUsers, wanted, " ")
  for (u = 1; u <= userCount; u++) {
    wanted[u] = sanitizeUser(wanted[u])
    for (i = 0; i <= n; i++) {
      j = getUserIdxInGpu(i, wanted[u])
      if (j == getUserCountInGpu(i)) {
        gpu["user", i, j] = wanted[u]
        gpu["used", i, j] = "0"
      }
    }
  }

  if (arg == "config") {
    # Summary graph covering all GPUs.
    print "multigraph gpu_multigraph"
    print "graph_title GPU memory usage by user"
    print "graph_args --base 1000 -r --lower-limit 0"
    print "graph_category gpu_by_user"
    print "graph_info This graph shows GPU memory usage, for monitored users."
    print "graph_vlabel %"
    print "graph_period second"
    if (order == 1) printGraphOrder(0, n)
    for (i = 0; i <= n; i++) printFieldConfig(i)

    # Report the total of each GPU next to its own index.
    printf("graph_info FB Memory usage for NVIDIA GPUs (total memory is: %s in GPU%d", gpu["total", 0], 0)
    for (i = 1; i <= n; i++) {
      printf(", %s in GPU%d", gpu["total", i], i)
    }
    printf ")\n\n"

    # One sub graph per GPU.
    for (i = 0; i <= n; i++) {
      print "multigraph gpu_multigraph.gpu" i
      print "graph_info Memory information for " gpu["name", i]
      print "graph_title GPU" i " memory usage by user"
      print "graph_args --base 1000 -r --lower-limit 0 --upper-limit 100"
      print "graph_category gpu_by_user"
      print "graph_info This graph shows GPU" i " memory usage, for monitored users."
      print "graph_vlabel %"
      print "graph_scale no"
      print "graph_period second"
      if (order == 1) printGraphOrder(i, i)
      printFieldConfig(i)
      print ""
    }
  }
  else {
    # Values for the summary graph, then for each sub graph.
    print "multigraph gpu_multigraph"
    for (i = 0; i <= n; i++) printFieldValues(i)
    print ""
    for (i = 0; i <= n; i++) {
      print "multigraph gpu_multigraph.gpu" i
      printFieldValues(i)
      print ""
    }
  }
}

# Round _n to two decimal places.
function getTwoDecimalPlaces(_n) {
  return int(_n * 100 + 0.5) / 100.0
}

# Index of _user among the users of GPU _n, or the user count when absent.
# k is a function-local (extra parameter) so the callers variables stay intact.
function getUserIdxInGpu(_n, _user,    k) {
  k = 0
  while (gpu["user", _n, k] != "") {
    if (gpu["user", _n, k] == _user) return k
    k++
  }
  return k
}

# Number of users recorded for GPU _n.
function getUserCountInGpu(_n,    k) {
  k = 0
  while (gpu["user", _n, k] != "") {
    k++
  }
  return k
}

# Rewrite name so that it can be used inside a Munin field name: a leading
# character other than a letter or underscore, and any later character other
# than a letter, digit or underscore, becomes an underscore.
function sanitizeUser(name) {
  sub(/^[^A-Za-z_]/, "_", name)
  gsub(/[^A-Za-z0-9_]/, "_", name)
  return name
}

# Read the pid to user table from ps once and close the pipe afterwards.
function loadOwners(    cmd, line, f) {
  cmd = "ps -axo pid,user"
  while ((cmd | getline line) > 0) {
    if (split(line, f, " ") >= 2) owner[f[1]] = f[2]
  }
  close(cmd)
  ownersLoaded = 1
}

# Sanitized owner of process pid, or "other" when ps does not list it.
function lookupUser(pid,    name) {
  if (!ownersLoaded) loadOwners()
  name = ""
  if (pid in owner) name = sanitizeUser(owner[pid])
  if (name == "") name = "other"
  return name
}

# Used memory as a percentage of total; "U" (unknown) when the total is
# missing, zero or not a number, instead of aborting on a division by zero.
function percentUsed(used, total) {
  if (total + 0 <= 0) return "U"
  return getTwoDecimalPlaces(100.0 * used / total)
}

# Print a graph_order line listing the configured users of GPUs first..last.
function printGraphOrder(first, last,    g, u) {
  printf "graph_order"
  for (g = first; g <= last; g++) {
    for (u = 1; u <= userCount; u++) {
      printf(" gpu%s_%s", g, wanted[u])
    }
  }
  print ""
}

# Print the field definitions of every user of GPU g.
function printFieldConfig(g,    k, count, field) {
  count = getUserCountInGpu(g)
  for (k = 0; k < count; k++) {
    field = "gpu" g "_" gpu["user", g, k]
    print field ".label " field
    print field ".info GPU" g " used by " gpu["user", g, k]
    print field ".min 0"
    print field ".draw AREASTACK"
    print field ".type GAUGE"
  }
}

# Print the current value of every user of GPU g.
function printFieldValues(g,    k, count) {
  count = getUserCountInGpu(g)
  for (k = 0; k < count; k++) {
    print "gpu" g "_" gpu["user", g, k] ".value " percentUsed(gpu["used", g, k], gpu["total", g])
  }
}
'
}

smiOutput=$("$nvSmiExec" -q)

# Set to 1 to emit graph_order lines that list the env.gpuusers names in the
# order they were configured; 0 leaves the ordering to Munin.
printGraphOrder=0

# output graph data (the exit status of the plugin is that of the awk stage)
printf '%s\n' "$smiOutput" |
  extract_smi_fields |
  render_gpu_by_user "$1" "$gpuUSERS" "$printGraphOrder"