mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-24 18:07:20 +00:00
Initial version
This commit is contained in:
parent
6ce22c9d10
commit
f1cbf1ac35
1 changed files with 438 additions and 0 deletions
438
plugins/other/boinc_estwk
Executable file
438
plugins/other/boinc_estwk
Executable file
|
@ -0,0 +1,438 @@
|
|||
#!/usr/bin/perl -w
|
||||
#
|
||||
# boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs
|
||||
#
|
||||
# Run 'perldoc boinc_estwk' for full man page
|
||||
#
|
||||
# Author: Palo M. <palo.gm@gmail.com>
|
||||
# License: GPLv3 <http://www.gnu.org/licenses/gpl-3.0.txt>
|
||||
#
|
||||
#
|
||||
# Parameters supported:
|
||||
# config
|
||||
#
|
||||
#
|
||||
# Configurable variables
|
||||
# boinccmd - command-line control program (default: boinccmd)
|
||||
# host - Host to query (default: none)
|
||||
# port - GUI RPC port (default: none = use BOINC-default)
|
||||
# boincdir - Directory containing appropriate password file
|
||||
# gui_rpc_auth.cfg (default: none)
|
||||
# estwk_warn - Warning level - minimum estimated work (default: 24.00 hours)
|
||||
# password - Password for BOINC (default: none) !!! UNSAFE !!!
|
||||
#
|
||||
#
|
||||
# $Log$
|
||||
#
|
||||
# Revision 1.0 2009/09/13 Palo M.
|
||||
# Add documentation and license information
|
||||
# Ready to publish on Munin Exchange
|
||||
# Revision 0.9 2009/09/13 Palo M.
|
||||
# Add possibility to read password from file
|
||||
# Revision 0.8 2009/09/12 Palo M.
|
||||
# Update default binary name: boinc_cmd -> boinccmd
|
||||
# Revision 0.7 2008/08/30 Palo M.
|
||||
# Creation - Attempt to port functionality from C++ code
|
||||
#
|
||||
# (Revisions 0.1 - 0.6) were done in C++
|
||||
#
|
||||
#
|
||||
#
|
||||
# Magic markers:
|
||||
#%# family=contrib
|
||||
|
||||
use strict;
|
||||
|
||||
|
||||
#########################################################################
|
||||
# 1. Parse configuration variables
|
||||
#
|
||||
my $BOINCCMD = exists $ENV{'boinccmd'} ? $ENV{'boinccmd'} : "boinccmd";
|
||||
my $HOST = exists $ENV{'host'} ? $ENV{'host'} : undef;
|
||||
my $PORT = exists $ENV{'port'} ? $ENV{'port'} : undef;
|
||||
my $PASSWORD = exists $ENV{'password'} ? $ENV{'password'} : undef;
|
||||
my $BOINCDIR = exists $ENV{'boincdir'} ? $ENV{'boincdir'} : undef;
|
||||
my $ESTWKWRN = exists $ENV{'estwk_warn'} ? $ENV{'estwk_warn'} : 24;
|
||||
|
||||
#########################################################################
|
||||
# 2. Basic executable
|
||||
#
|
||||
if (defined $HOST) {
|
||||
$BOINCCMD .= " --host $HOST";
|
||||
if (defined $PORT) {
|
||||
$BOINCCMD .= ":$PORT";
|
||||
}
|
||||
}
|
||||
if (defined $PASSWORD) {
|
||||
$BOINCCMD .= " --passwd $PASSWORD";
|
||||
}
|
||||
if (defined $BOINCDIR) {
|
||||
chdir $BOINCDIR;
|
||||
}
|
||||
|
||||
#########################################################################
|
||||
# 3. Get host info, to retrieve number of CPUs
|
||||
#
|
||||
my $nCPUs;
|
||||
my $hostInfo = `$BOINCCMD --get_host_info 2>/dev/null`;
|
||||
if ($hostInfo ne "") {
|
||||
my @hostInfo = split /\n/, $hostInfo;
|
||||
my @nCPUs = grep /^\s+#CPUS: /,@hostInfo;
|
||||
if ($#nCPUs != 0) { die "Unexpected output from boinccmd"; }
|
||||
$nCPUs = $nCPUs[0];
|
||||
$nCPUs =~ s/^\s+#CPUS: //;
|
||||
no warnings; # for following line only
|
||||
if ($nCPUs < 1) { die "Unexpected output from boinccmd"; }
|
||||
}
|
||||
else {
|
||||
# No host info (e.g. client not running)
|
||||
exit -1;
|
||||
}
|
||||
|
||||
#print "$nCPUs\n";
|
||||
|
||||
#########################################################################
|
||||
# 4. Display config if applicable
|
||||
#
|
||||
if ( (defined $ARGV[0]) && ($ARGV[0] eq "config") ) {
|
||||
|
||||
if (defined $HOST) {
|
||||
print "host_name $HOST\n";
|
||||
}
|
||||
|
||||
print "graph_title BOINC work cache estimation\n";
|
||||
print "graph_category BOINC\n";
|
||||
print "graph_args --base 1000 -l 0 --alt-autoscale-max\n";
|
||||
print "graph_vlabel Hours\n";
|
||||
print "graph_scale no\n";
|
||||
|
||||
# Longest WU is AREA, each CPU estimated is LINE2
|
||||
print "longest.label Longest WU\n";
|
||||
print "longest.draw AREA\n";
|
||||
print "longest.type GAUGE\n";
|
||||
for (my $i = 0; $i < $nCPUs; ++$i) {
|
||||
print "cpu$i.label CPU$i\n";
|
||||
print "cpu$i.draw LINE2\n";
|
||||
print "cpu$i.type GAUGE\n";
|
||||
printf "cpu$i.warning %.2f:\n",$ESTWKWRN;
|
||||
print "cpu$i.critical 0:\n";
|
||||
}
|
||||
|
||||
exit 0;
|
||||
}
|
||||
|
||||
#########################################################################
|
||||
# 5. Fetch all needed data from BOINC-client with single call
|
||||
#
|
||||
my $prj_status = "";
|
||||
my $results = "";
|
||||
|
||||
my $simpleGuiInfo = `$BOINCCMD --get_simple_gui_info 2>/dev/null`;
|
||||
if ($simpleGuiInfo ne "") {
|
||||
# Some data were retrieved, so let's split them
|
||||
my @sections;
|
||||
my @section1;
|
||||
@sections = split /=+ Projects =+\n/, $simpleGuiInfo;
|
||||
@section1 = split /=+ [A-z]+ =+\n/, $sections[1];
|
||||
$prj_status = $section1[0];
|
||||
|
||||
@sections = split /=+ Results =+\n/, $simpleGuiInfo;
|
||||
@section1 = split /=+ [A-z]+ =+\n/, $sections[1];
|
||||
$results = $section1[0];
|
||||
}
|
||||
|
||||
#########################################################################
|
||||
# 6. Parse BOINC data
|
||||
#
|
||||
# 6.a) Get suspended projects
|
||||
my @prjInfos = split /\d+\) -+\n/, $prj_status;
|
||||
shift @prjInfos; # Throw out first empty line
|
||||
|
||||
my @susp_projects; # array of suspended projects
|
||||
for my $prj_info (@prjInfos) {
|
||||
my @lines = split /\n/, $prj_info;
|
||||
my @prjURL = grep /^\s+master URL: /,@lines;
|
||||
if ($#prjURL != 0) {die "Unexpected output from boinccmd"; }
|
||||
my $prjURL =$prjURL[0];
|
||||
$prjURL =~ s/^\s+master URL: //;
|
||||
my @suspGUI = grep /^\s+suspended via GUI: /,@lines;
|
||||
if ($#suspGUI != 0) {die "Unexpected output from boinccmd"; }
|
||||
my $suspGUI =$suspGUI[0];
|
||||
$suspGUI =~ s/^\s+suspended via GUI: //;
|
||||
if ($suspGUI eq "yes") {
|
||||
push @susp_projects, $prjURL
|
||||
}
|
||||
}
|
||||
for my $i (@susp_projects) { print "$i\n"; }
|
||||
|
||||
# 6.b) Parse results, check their states
|
||||
# Get those which are NOT suspended by GUI
|
||||
my @rsltInfos = split /\d+\) -+\n/, $results;
|
||||
shift @rsltInfos; # Throw out first empty line
|
||||
my @rsltRemain;
|
||||
|
||||
for my $rslt_info (@rsltInfos) {
|
||||
my @lines = split /\n/, $rslt_info;
|
||||
my @estRemain = grep /^\s+estimated CPU time remaining: /,@lines;
|
||||
my $estRemain = $estRemain[0];
|
||||
$estRemain =~ s/^\s+estimated CPU time remaining: //;
|
||||
my @schedstat = grep /^\s+scheduler state: /,@lines;
|
||||
my $schedstat = $schedstat[0];
|
||||
$schedstat =~ s/^\s+scheduler state: //;
|
||||
my @state = grep /^\s+state: /,@lines;
|
||||
my $state = $state[0];
|
||||
$state =~ s/^\s+state: //;
|
||||
my @acttask = grep /^\s+active_task_state: /,@lines;
|
||||
my $acttask = $acttask[0];
|
||||
$acttask =~ s/^\s+active_task_state: //;
|
||||
my @suspGUI = grep /^\s+suspended via GUI: /,@lines;
|
||||
my $suspGUI =$suspGUI[0];
|
||||
$suspGUI =~ s/^\s+suspended via GUI: //;
|
||||
my @prjURL = grep /^\s+project URL: /,@lines;
|
||||
my $prjURL =$prjURL[0];
|
||||
$prjURL =~ s/^\s+project URL: //;
|
||||
if ($suspGUI eq "yes") {
|
||||
# This result is not in work cache - at the moment
|
||||
next;
|
||||
}
|
||||
my @suspPRJ = grep /^$prjURL$/,@susp_projects;
|
||||
if ($#suspPRJ == 0) {
|
||||
# This result is not in work cache - at the moment
|
||||
next;
|
||||
}
|
||||
if ($state eq "2") {
|
||||
# RESULT_FILES_DOWNLOADED
|
||||
if ( ($schedstat eq "0") ||
|
||||
($schedstat eq "1") ) {
|
||||
# CPU_SCHED_UNINITIALIZED 0
|
||||
# Not started yet: result is available in work cache
|
||||
# CPU_SCHED_PREEMPTED 1
|
||||
# preempted: result is available in work cache
|
||||
push @rsltRemain,$estRemain;
|
||||
next;
|
||||
}
|
||||
if ($schedstat eq "2") {
|
||||
# CPU_SCHED_SCHEDULED 2
|
||||
if ( ($acttask eq "1") ||
|
||||
($acttask eq "0") ||
|
||||
($acttask eq "9") ) {
|
||||
# PROCESS_EXECUTING 1
|
||||
# running
|
||||
# PROCESS_UNINITIALIZED 0
|
||||
# PROCESS_SUSPENDED 9
|
||||
# suspended by "user active"/benchmark?
|
||||
# available in work cache
|
||||
push @rsltRemain,$estRemain;
|
||||
next;
|
||||
}
|
||||
# other active-task-state - maybe failing/aborted WU
|
||||
# => not in work cache
|
||||
next;
|
||||
}
|
||||
# There should be no other scheduler state
|
||||
next;
|
||||
}
|
||||
# RESULT_FILES_DOWNLOADING
|
||||
# RESULT_COMPUTE_ERROR
|
||||
# RESULT_FILES_UPLOADING
|
||||
# RESULT_FILES_UPLOADED
|
||||
# RESULT_ABORTED
|
||||
# => not in work cache
|
||||
}
|
||||
|
||||
#########################################################################
|
||||
# 7. Distribute remaining results per CPUs
|
||||
#
|
||||
# 7.a) Sort remaining results descending
|
||||
my @sortRemain = sort {$b <=> $a} @rsltRemain;
|
||||
|
||||
# 7.b) Assign to CPU with smallest workcache
|
||||
my @CPUcache;
|
||||
for (my $i = 0; $i < $nCPUs; ++$i) {
|
||||
$CPUcache[$i] = 0;
|
||||
}
|
||||
|
||||
for my $length (@sortRemain) {
|
||||
# find CPU with smallest workcache:
|
||||
my @sortedCPUs = sort {$a <=> $b} @CPUcache;
|
||||
$sortedCPUs[0] = $sortedCPUs[0] + $length;
|
||||
@CPUcache = @sortedCPUs;
|
||||
}
|
||||
|
||||
# At the end, sort CPUs descending
|
||||
@CPUcache = sort {$b <=> $a} @CPUcache;
|
||||
|
||||
#########################################################################
|
||||
# 8. Display output
|
||||
#
|
||||
|
||||
# Convert from seconds to hours
|
||||
printf "longest.value %.2f\n",$sortRemain[0]/3600;
|
||||
for (my $i = 0; $i < $nCPUs; ++$i) {
|
||||
printf "cpu$i.value %.2f\n",$CPUcache[$i]/3600;
|
||||
}
|
||||
|
||||
exit 0;
|
||||
|
||||
|
||||
#########################################################################
|
||||
# perldoc section
|
||||
|
||||
=head1 NAME
|
||||
|
||||
boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs
|
||||
|
||||
=head1 APPLICABLE SYSTEMS
|
||||
|
||||
Linux machines running BOINC and munin-node
|
||||
|
||||
- or -
|
||||
|
||||
Linux servers (running munin-node) used to collect data from other systems
|
||||
which are running BOINC, but not running munin-node (e.g. non-Linux systems)
|
||||
|
||||
=head1 CONFIGURATION
|
||||
|
||||
Following configuration variables are supported:
|
||||
|
||||
=over 12
|
||||
|
||||
=item B<boinccmd>
|
||||
|
||||
command-line control program (default: boinccmd)
|
||||
|
||||
=item B<host>
|
||||
|
||||
Host to query (default: none)
|
||||
|
||||
=item B<port>
|
||||
|
||||
GUI RPC port (default: none = use BOINC-default)
|
||||
|
||||
=item B<boincdir>
|
||||
|
||||
Directory containing appropriate file gui_rpc_auth.cfg (default: none)
|
||||
|
||||
=item B<estwk_warn>
|
||||
|
||||
Warning level - minimum estimated work (default: 24.00 hours)
|
||||
|
||||
=item B<password>
|
||||
|
||||
Password for BOINC (default: none)
|
||||
|
||||
=back
|
||||
|
||||
=head2 B<Security Consideration:>
|
||||
|
||||
Using of variable B<password> poses a security risk. Even if the Munin
|
||||
configuration file for this plugin containing BOINC-password is properly
|
||||
protected, the password is exposed as environment variable and finally passed
|
||||
to boinccmd as a parameter. It is therefore possible for local users of the
|
||||
machine running this plugin to eavesdrop the BOINC password.
|
||||
|
||||
Using of variable password is therefore strongly discouraged and is left here
|
||||
as a legacy option and for testing purposes.
|
||||
|
||||
It should be always possible to use B<boincdir> variable instead - in such case
|
||||
the file gui_rpc_auth.cfg is read by boinccmd binary directly.
|
||||
If this plugin is used to fetch data from remote system, the gui_rpc_auth.cfg
|
||||
can be copied to special directory in a secure way (e.g. via scp) and properly
|
||||
protected by file permissions.
|
||||
|
||||
=head1 INTERPRETATION
|
||||
|
||||
This plugin shows the estimated remaining computation time for all CPUs of
|
||||
the machine and the estimated remaining computation time of longest workunit.
|
||||
The estimation is based on assumption that the workunits of different lengths
|
||||
will be distributed to the CPUs evenly (which is not always the case).
|
||||
|
||||
The warning level can be used to warn in forward about the risk of workunits
|
||||
local cache depletion and BOINC client running out of the work.
|
||||
Although such warning can be achieved by configuring Munin master, there is
|
||||
also this option to configure it on munin-node side.
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
=head2 Local BOINC Example
|
||||
|
||||
BOINC is running on local machine. The BOINC binaries are installed in
|
||||
F</opt/boinc/custom-6.10.1/>, the BOINC is running in directory
|
||||
F</usr/local/boinc/> under username boinc, group boinc and the password is used
|
||||
to protect access to BOINC.
|
||||
Warning will be set when estimated work for any of CPUs will decrease under
|
||||
48 hours:
|
||||
|
||||
[boinc_*]
|
||||
group boinc
|
||||
env.boinccmd /opt/boinc/custom-6.10.1/boinccmd
|
||||
env.boincdir /usr/local/boinc
|
||||
env.warn 48
|
||||
|
||||
=head2 Remote BOINC Example
|
||||
|
||||
BOINC is running on 2 remote machines C<foo> and C<bar>.
|
||||
On the local machine the binary of command-line interface is installed in
|
||||
directory F</usr/local/bin/>.
|
||||
The BOINC password used on the remote machine C<foo> is stored in file
|
||||
F</etc/munin/boinc/foo/gui_rpc_auth.cfg>.
|
||||
The BOINC password used on the remote machine C<bar> is stored in file
|
||||
F</etc/munin/boinc/bar/gui_rpc_auth.cfg>.
|
||||
These files are owned and readable by root, readable by group munin and not
|
||||
readable by others.
|
||||
There are 2 symbolic links to this plugin created in the munin plugins
|
||||
directory (usually F</etc/munin/plugins/>): F<snmp_foo_boincestwk> and
|
||||
F<snmp_bar_boincestwk>
|
||||
|
||||
[snmp_foo_boinc*]
|
||||
group munin
|
||||
env.boinccmd /usr/local/bin/boinccmd
|
||||
env.host foo
|
||||
env.boincdir /etc/munin/boinc/foo
|
||||
|
||||
[snmp_bar_boinc*]
|
||||
group munin
|
||||
env.boinccmd /usr/local/bin/boinccmd
|
||||
env.host bar
|
||||
env.boincdir /etc/munin/boinc/bar
|
||||
|
||||
This way the plugin can be used by Munin the same way as the Munin plugins
|
||||
utilizng SNMP (although this plugin itself does not use SNMP).
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
The estimation is based on simple assumption, that longest workunits will be
|
||||
processed first. This is the case when work is distributed evenly among CPUs.
|
||||
But this is not always the case, because various deadlines for various
|
||||
workunits may fire the "panic mode" of BOINC and scheduling could be much
|
||||
different.
|
||||
For example, there can be 4 CPUs, and BOINC having downloaded 4 workunits
|
||||
with estimated run-time 1 hour each and 3 workunits with estimated run-time
|
||||
4 hours each.
|
||||
This Munin plugin will report estimated work 4 hours for each CPU.
|
||||
But if deadline of those 1-hour workunits will be much shorter than deadline
|
||||
of those 4-hours workunits, BOINC will schedule short workunits first (for all
|
||||
4 CPUs) and after finishing them it will schedule those long workunits.
|
||||
This will result in real computation for 5 hours on 3 CPUs but only 1 hour on
|
||||
remaining 4th CPU. So after 1 hour of computation 1 of CPUs will run out of
|
||||
work.
|
||||
|
||||
There is no C<autoconf> capability at the moment. This is due to the fact, that
|
||||
BOINC installations may vary over different systems, sometimes using default
|
||||
directory from distribution (e.g. F</var/lib/boinc/> in Debian or Ubuntu), but
|
||||
often running in user directories or in other separate directories.
|
||||
Also the user-ID under which BOINC runs often differs.
|
||||
Under these circumstances the C<autoconf> would be either lame or too
|
||||
complicated.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Palo M. <palo.gm@gmail.com>
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
GPLv3 L<http://www.gnu.org/licenses/gpl-3.0.txt>
|
||||
|
||||
=cut
|
||||
|
||||
# vim:syntax=perl
|
Loading…
Add table
Add a link
Reference in a new issue