#!/usr/bin/perl -w
#
# boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs
#
# Run 'perldoc boinc_estwk' for full man page
#
# Author:  Palo M.
# License: GPLv3
#
#
# Parameters supported:
#   config
#
#
# Configurable variables
#   boinccmd   - command-line control program (default: boinccmd)
#   host       - Host to query (default: none)
#   port       - GUI RPC port (default: none = use BOINC-default);
#                only used together with host
#   boincdir   - Directory containing appropriate password file
#                gui_rpc_auth.cfg (default: none)
#   estwk_warn - Warning level - minimum estimated work (default: 24.00 hours)
#   password   - Password for BOINC (default: none) !!! UNSAFE !!!
#
#
# $Log$
#
# Revision 1.0  2009/09/13  Palo M.
#   Add documentation and license information
#   Ready to publish on Munin Exchange
# Revision 0.9  2009/09/13  Palo M.
#   Add possibility to read password from file
# Revision 0.8  2009/09/12  Palo M.
#   Update default binary name: boinc_cmd -> boinccmd
# Revision 0.7  2008/08/30  Palo M.
#   Creation - Attempt to port functionality from C++ code
#
# (Revisions 0.1 - 0.6) were done in C++
#
#
#
# Magic markers:
#%# family=contrib

use strict;
use warnings;


#########################################################################
# 1. Parse configuration variables
#
my $BOINCCMD = exists $ENV{'boinccmd'}   ? $ENV{'boinccmd'}   : "boinccmd";
my $HOST     = exists $ENV{'host'}       ? $ENV{'host'}       : undef;
my $PORT     = exists $ENV{'port'}       ? $ENV{'port'}       : undef;
my $PASSWORD = exists $ENV{'password'}   ? $ENV{'password'}   : undef;
my $BOINCDIR = exists $ENV{'boincdir'}   ? $ENV{'boincdir'}   : undef;
my $ESTWKWRN = exists $ENV{'estwk_warn'} ? $ENV{'estwk_warn'} : 24;

#########################################################################
# Helper subroutines
#

# Quote $value so that /bin/sh passes it to the command as exactly one word,
# whatever characters (spaces, '$', quotes, ...) it contains.
sub shell_quote {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
}

# Return the values of all lines of the form "<whitespace><label>: <value>"
# found in the array referenced by $lines (empty list if there is none).
# The label is matched literally.
sub field_values {
    my ($lines, $label) = @_;
    my @values;
    for my $line (@$lines) {
        push @values, $1 if $line =~ /^\s+\Q$label\E: (.*)$/;
    }
    return @values;
}

# Return the value of a field which has to be present exactly once;
# dies when it is missing or repeated.
sub required_field {
    my ($lines, $label) = @_;
    my @values = field_values($lines, $label);
    die "Unexpected output from boinccmd" if @values != 1;
    return $values[0];
}

# Return the value of the first matching field, or an empty string when the
# field is absent.
sub optional_field {
    my ($lines, $label) = @_;
    my ($value) = field_values($lines, $label);
    return defined $value ? $value : "";
}

# Return the text between the header "==== <name> ====" and the next
# "==== <anything> ====" header (or the end of $text).
# Returns an empty string when there is no such section.
sub extract_section {
    my ($text, $name) = @_;
    my (undef, $tail) = split /=+ \Q$name\E =+\n/, $text, 2;
    return "" unless defined $tail;
    my ($body) = split /=+ [A-Za-z][A-Za-z ]* =+\n/, $tail, 2;
    return defined $body ? $body : "";
}

# Distribute the given remaining times (seconds) over $ncpus CPUs:
# longest first, each one to the CPU with the smallest work cache so far.
# Returns the per-CPU totals sorted in descending order.
sub distribute_work {
    my ($ncpus, @lengths) = @_;
    my @load = (0) x $ncpus;
    for my $length (sort { $b <=> $a } @lengths) {
        my $idlest = 0;
        for my $cpu (1 .. $#load) {
            $idlest = $cpu if $load[$cpu] < $load[$idlest];
        }
        $load[$idlest] += $length;
    }
    my @sorted = sort { $b <=> $a } @load;
    return @sorted;
}

#########################################################################
# 2. Basic executable
#
# $BOINCCMD itself is left unquoted on purpose (it is handed to the shell
# as configured); the values appended to it are quoted.
# NOTE: port is only honoured when host is set as well.
if (defined $HOST) {
    my $target = defined $PORT ? "$HOST:$PORT" : $HOST;
    $BOINCCMD .= " --host " . shell_quote($target);
}
if (defined $PASSWORD) {
    $BOINCCMD .= " --passwd " . shell_quote($PASSWORD);
}
if (defined $BOINCDIR) {
    # boinccmd is run from this directory; fail loudly if it is not
    # accessible instead of silently running from somewhere else.
    chdir $BOINCDIR
        or die "Cannot change directory to '$BOINCDIR': $!\n";
}

#########################################################################
# 3. Get host info, to retrieve number of CPUs
#
my $hostInfo = `$BOINCCMD --get_host_info 2>/dev/null`;
if (!defined $hostInfo || $hostInfo eq "") {
    # No host info (e.g. client not running)
    exit -1;
}

my @hostLines = split /\n/, $hostInfo;
my ($nCPUs) = required_field(\@hostLines, "#CPUS") =~ /^\s*(\d+)/;
if (!defined $nCPUs || $nCPUs < 1) {
    die "Unexpected output from boinccmd";
}

#########################################################################
# 4. Display config if applicable
#
if ( (defined $ARGV[0]) && ($ARGV[0] eq "config") ) {

    if (defined $HOST) {
        print "host_name $HOST\n";
    }

    print "graph_title BOINC work cache estimation\n";
    print "graph_category BOINC\n";
    print "graph_args --base 1000 -l 0 --alt-autoscale-max\n";
    print "graph_vlabel Hours\n";
    print "graph_scale no\n";

    # Longest WU is AREA, each CPU estimated is LINE2
    print "longest.label Longest WU\n";
    print "longest.draw AREA\n";
    print "longest.type GAUGE\n";
    for my $i (0 .. $nCPUs - 1) {
        print "cpu$i.label CPU$i\n";
        print "cpu$i.draw LINE2\n";
        print "cpu$i.type GAUGE\n";
        printf "cpu$i.warning %.2f:\n",$ESTWKWRN;
        print "cpu$i.critical 0:\n";
    }

    exit 0;
}

#########################################################################
# 5. Fetch all needed data from BOINC-client with single call
#
my $simpleGuiInfo = `$BOINCCMD --get_simple_gui_info 2>/dev/null`;
$simpleGuiInfo = "" unless defined $simpleGuiInfo;

# Both are empty strings when nothing was retrieved or the section is missing
my $prj_status = extract_section($simpleGuiInfo, "Projects");
my $results    = extract_section($simpleGuiInfo, "Results");

#########################################################################
# 6. Parse BOINC data
#
# 6.a) Get suspended projects
my @prjInfos = split /\d+\) -+\n/, $prj_status;
shift @prjInfos; # Throw out first empty line

my %is_suspended; # master URL => 1 for projects suspended via GUI
for my $prj_info (@prjInfos) {
    my @lines   = split /\n/, $prj_info;
    my $prjURL  = required_field(\@lines, "master URL");
    my $suspGUI = required_field(\@lines, "suspended via GUI");
    $is_suspended{$prjURL} = 1 if $suspGUI eq "yes";
}

# 6.b) Parse results, check their states
# Get those which are NOT suspended by GUI
my @rsltInfos = split /\d+\) -+\n/, $results;
shift @rsltInfos; # Throw out first empty line
my @rsltRemain;

for my $rslt_info (@rsltInfos) {
    my @lines = split /\n/, $rslt_info;

    # Result or its whole project suspended:
    # not in work cache - at the moment
    next if optional_field(\@lines, "suspended via GUI") eq "yes";
    next if $is_suspended{ optional_field(\@lines, "project URL") };

    # Only RESULT_FILES_DOWNLOADED (2) can be in work cache. Other states:
    #   RESULT_FILES_DOWNLOADING, RESULT_COMPUTE_ERROR,
    #   RESULT_FILES_UPLOADING, RESULT_FILES_UPLOADED, RESULT_ABORTED
    # => not in work cache
    next unless optional_field(\@lines, "state") eq "2";

    my $schedstat = optional_field(\@lines, "scheduler state");
    my $in_cache;
    if ( ($schedstat eq "0") || ($schedstat eq "1") ) {
        # CPU_SCHED_UNINITIALIZED 0
        #   Not started yet: result is available in work cache
        # CPU_SCHED_PREEMPTED 1
        #   preempted: result is available in work cache
        $in_cache = 1;
    }
    elsif ($schedstat eq "2") {
        # CPU_SCHED_SCHEDULED 2
        #   PROCESS_EXECUTING 1
        #     running
        #   PROCESS_UNINITIALIZED 0
        #   PROCESS_SUSPENDED 9
        #     suspended by "user active"/benchmark?
        #   => available in work cache
        #   other active-task-state - maybe failing/aborted WU
        #   => not in work cache
        my $acttask = optional_field(\@lines, "active_task_state");
        $in_cache = ($acttask eq "1") || ($acttask eq "0") || ($acttask eq "9");
    }
    # There should be no other scheduler state
    next unless $in_cache;

    my $estRemain = optional_field(\@lines, "estimated CPU time remaining");
    $estRemain = 0 if $estRemain eq ""; # no estimate reported
    push @rsltRemain, $estRemain;
}

#########################################################################
# 7. Distribute remaining results per CPUs
#
# 7.a) Sort remaining results descending
my @sortRemain = sort {$b <=> $a} @rsltRemain;

# 7.b) Assign each result to the CPU with smallest workcache;
#      the per-CPU totals come back sorted descending
my @CPUcache = distribute_work($nCPUs, @sortRemain);

#########################################################################
# 8. Display output
#

# Nothing in work cache means the longest WU is 0
my $longest = @sortRemain ? $sortRemain[0] : 0;

# Convert from seconds to hours
printf "longest.value %.2f\n",$longest/3600;
for my $i (0 .. $nCPUs - 1) {
    printf "cpu$i.value %.2f\n",$CPUcache[$i]/3600;
}

exit 0;


#########################################################################
# perldoc section

=head1 NAME

boinc_estwk - Munin plugin to monitor estimated time of BOINC WUs

=head1 APPLICABLE SYSTEMS

Linux machines running BOINC and munin-node

- or -

Linux servers (running munin-node) used to collect data from other systems
which are running BOINC, but not running munin-node (e.g. non-Linux systems)

=head1 CONFIGURATION

Following configuration variables are supported:

=over 12

=item B<boinccmd>

command-line control program (default: boinccmd)

=item B<host>

Host to query (default: none)

=item B<port>

GUI RPC port (default: none = use BOINC-default); only used together with
B<host>

=item B<boincdir>

Directory containing appropriate file gui_rpc_auth.cfg (default: none)

=item B<estwk_warn>

Warning level - minimum estimated work (default: 24.00 hours)

=item B<password>

Password for BOINC (default: none)

=back

=head2 B<Security note>

Using of variable B<password> poses a security risk. Even if the Munin
configuration file for this plugin containing BOINC-password is properly
protected, the password is exposed as environment variable and finally passed
to boinccmd as a parameter.
It is therefore possible for local users of the
machine running this plugin to eavesdrop the BOINC password.

Using the variable B<password> is therefore strongly discouraged; it is left
here as a legacy option and for testing purposes.

It should always be possible to use the B<boincdir> variable instead - in such
case the file gui_rpc_auth.cfg is read by boinccmd binary directly.
If this plugin is used to fetch data from remote system, the gui_rpc_auth.cfg
can be copied to special directory in a secure way (e.g. via scp) and properly
protected by file permissions.

=head1 INTERPRETATION

This plugin shows the estimated remaining computation time for all CPUs of
the machine and the estimated remaining computation time of longest workunit.
The estimation is based on assumption that the workunits of different lengths
will be distributed to the CPUs evenly (which is not always the case).

The warning level can be used to warn in advance about the risk of workunits
local cache depletion and BOINC client running out of the work.
Although such warning can be achieved by configuring Munin master, there is
also this option to configure it on munin-node side.

=head1 EXAMPLES

=head2 Local BOINC Example

BOINC is running on local machine. The BOINC binaries are installed in
F</opt/boinc/custom-6.10.1/>, the BOINC is running in directory
F</usr/local/boinc/> under username boinc, group boinc and the password is used
to protect access to BOINC.
Warning will be set when estimated work for any of CPUs will decrease under
48 hours:

  [boinc_*]
  group boinc
  env.boinccmd /opt/boinc/custom-6.10.1/boinccmd
  env.boincdir /usr/local/boinc
  env.estwk_warn 48

=head2 Remote BOINC Example

BOINC is running on 2 remote machines C<foo> and C<bar>.
On the local machine the binary of command-line interface is installed in
directory F</usr/local/bin/>.
The BOINC password used on the remote machine C<foo> is stored in file
F</etc/munin/boinc/foo/gui_rpc_auth.cfg>.
The BOINC password used on the remote machine C<bar> is stored in file
F</etc/munin/boinc/bar/gui_rpc_auth.cfg>.
These files are owned and readable by root, readable by group munin and not
readable by others.
There are 2 symbolic links to this plugin created in the munin plugins
directory (usually F</etc/munin/plugins/>): F<snmp_foo_boincestwk> and
F<snmp_bar_boincestwk>

  [snmp_foo_boinc*]
  group munin
  env.boinccmd /usr/local/bin/boinccmd
  env.host foo
  env.boincdir /etc/munin/boinc/foo

  [snmp_bar_boinc*]
  group munin
  env.boinccmd /usr/local/bin/boinccmd
  env.host bar
  env.boincdir /etc/munin/boinc/bar

This way the plugin can be used by Munin the same way as the Munin plugins
utilizing SNMP (although this plugin itself does not use SNMP).

=head1 BUGS

The estimation is based on simple assumption, that longest workunits will be
processed first. This is the case when work is distributed evenly among CPUs.
But this is not always the case, because various deadlines for various
workunits may fire the "panic mode" of BOINC and scheduling could be much
different.
For example, there can be 4 CPUs, and BOINC having downloaded 4 workunits
with estimated run-time 1 hour each and 3 workunits with estimated run-time
4 hours each.
This Munin plugin will report estimated work 4 hours for each CPU.
But if deadline of those 1-hour workunits will be much shorter than deadline
of those 4-hours workunits, BOINC will schedule short workunits first (for all
4 CPUs) and after finishing them it will schedule those long workunits.
This will result in real computation for 5 hours on 3 CPUs but only 1 hour on
remaining 4th CPU. So after 1 hour of computation 1 of CPUs will run out of
work.

There is no C<autoconf> capability at the moment. This is due to the fact, that
BOINC installations may vary over different systems, sometimes using default
directory from distribution (e.g. F</var/lib/boinc-client/> in Debian or
Ubuntu), but often running in user directories or in other separate
directories.
Also the user-ID under which BOINC runs often differs.
Under these circumstances the C<autoconf> would be either lame or too
complicated.

=head1 AUTHOR

Palo M.

=head1 LICENSE

GPLv3 L<http://www.gnu.org/licenses/gpl-3.0.txt>

=cut

# vim:syntax=perl