mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-21 18:41:03 +00:00
Adds a plugin to monitor the pressure stall information (psi) as reported by the Linux kernel. - groups averages per resource - rate/derive totals for ease of reading - resources, intervals and scopes configurable See: https://www.kernel.org/doc/html/latest/accounting/psi.html
331 lines
10 KiB
Bash
Executable file
331 lines
10 KiB
Bash
Executable file
#!/bin/bash
|
|
|
|
|
|
: << =cut
|
|
|
|
=head1 NAME
|
|
|
|
pressure - Plugin to monitor the pressure stall information for CPU, Memory and
|
|
IO as reported by the Linux kernel.
|
|
|
|
This plugin monitors the pressure stall information (psi) as reported by the
|
|
Linux Kernel. By default it reports all average intervals (10 seconds,
|
|
60 seconds and 300 seconds) as well as the total values as a rate of change
|
|
(DERIVE) for all resources (cpu, memory, io). The average intervals can be
|
|
configured if you only deem some of them useful. See CONFIGURATION for
|
|
explanations on that.
|
|
|
|
This is a multigraph plugin that, by default, will create six detail graphs and
|
|
one summary graph (so seven in total). The summary graph will contain the 300
|
|
seconds average percentages of all resources. The detail graphs are split in two
|
|
graphs per resource. One combining all average intervals and one for the
|
|
"totals" (rate of change) for the given resource.
|
|
|
|
There are no defaults for warnings and criticals, because this highly depends on
|
|
the system, so you need to configure them yourself (if you want any). It is
|
|
recommended that you first look up the meaning of the different values.
|
|
|
|
For more information on psi see:
|
|
https://www.kernel.org/doc/html/latest/accounting/psi.html
|
|
|
|
=head1 CONFIGURATION
|
|
|
|
Simply create a symlink in your plugins directory like with any other plugin.
|
|
No additional configuration needed, no specific user required (typically).
|
|
|
|
If you want to configure alerts, just add "warn_" or "crit_" in front of the
|
|
internal name.
|
|
|
|
Optional configuration examples:
|
|
|
|
[pressure]
|
|
env.resources cpu io memory - Specify the resources to monitor. Leave one
|
|
out if you don't want this one to be
|
|
monitored.
|
|
env.intervals avg10 avg60 avg300 - Specify the average intervals to monitor.
|
|
Leave one out if you don't want this one to
|
|
be monitored
|
|
env.scopes some full - Specify the scopes to monitor. Leave one out
|
|
if you don't want it to be monitored.
|
|
env.summary_interval avg300 - Specify the interval to be used for the
|
|
summary-graph.
|
|
env.warn_psi_cpu_avg300_some 5 - Set a warning-level of 5 for
|
|
"psi_cpu_avg300_some"
|
|
env.crit_psi_io_total_full 2000 - Set a critical-level of 2000 for
|
|
"psi_io_total_full"
|
|
|
|
=head1 AUTHOR
|
|
|
|
2022, HaseHarald
|
|
|
|
=head1 LICENSE
|
|
|
|
LGPLv3
|
|
|
|
=head1 BUGS
|
|
|
|
=head1 TODO
|
|
|
|
=head1 MAGIC MARKERS
|
|
|
|
#%# family=auto
|
|
#%# capabilities=autoconf
|
|
|
|
=cut
|
|
|
|
|
|
# This file contains a munin-plugin to graph the psi (pressure) for CPU, Memory
|
|
# and IO, as reported by the Linux kernel.
|
|
#
|
|
# This is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU Lesser General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Lesser General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Lesser General Public License
|
|
# along with this plugin. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
# Built-in defaults, used whenever the matching environment setting is unset
# or empty.
resource_defaults=('cpu' 'io' 'memory')
interval_defaults=('avg10' 'avg60' 'avg300')
scope_defaults=('some' 'full')

# Directory that holds one pressure file per resource.
pressure_dir=${pressure_dir:-'/proc/pressure/'}

# resources, intervals and scopes are configured as whitespace separated
# strings (see CONFIGURATION). Split them into real arrays, because every
# consumer in this plugin expands them as ${name[@]}; assigning an array
# expansion to a plain scalar only worked through implicit joining and
# re-splitting.
read -r -a pressure_resources <<< "${resources:-${resource_defaults[*]}}"
read -r -a pressure_intervals <<< "${intervals:-${interval_defaults[*]}}"
read -r -a pressure_scopes <<< "${scopes:-${scope_defaults[*]}}"

# Interval that is shown in the summary graph.
summary_interval=${summary_interval:-avg300}
|
|
|
|
# Answer an "autoconf" request: print "yes" when the pressure directory
# exists, otherwise "no" together with the path that was looked for.
# Reads: pressure_dir
check_autoconf() {
    if [ -d "${pressure_dir}" ]; then
        printf "yes\n"
    else
        # Quoted, so a path containing whitespace is reported once as a whole
        # instead of making printf repeat the format for every word.
        printf "no (%s not found)\n" "${pressure_dir}"
    fi
}
|
|
|
|
# Print a single number taken from a pressure file.
#   $1  resource; name of the file below pressure_dir (e.g. 'io')
#   $2  field to extract: 'avg10', 'avg60', 'avg300' or 'total'
#   $3  scope, i.e. the word the wanted line starts with (default: 'some')
# Prints the value that follows "<field>=" on the line of the given scope.
# Prints nothing when the file, the scope line or the field is missing.
# Reads: pressure_dir
get_pressure_value() {
    local resource=$1
    local interval=$2
    local scope=${3:-some}

    # The scope is anchored to the start of the line so it cannot match text
    # elsewhere in the line; the path is quoted so that a directory containing
    # whitespace still resolves to one file.
    grep -E "^${scope}[[:space:]]" "${pressure_dir%/}/${resource}" \
        | grep -o -E "${interval}=[0-9]{1,}(\.[0-9]{1,}){0,1}" \
        | cut -d '=' -f 2
}
|
|
|
|
# Translate an internal identifier into the human readable name used in graph
# titles and labels.
#   $1  kind of identifier: 'interval', 'scope' or 'resource'
#   $2  the identifier itself (e.g. 'avg10', 'some', 'cpu')
# Prints the readable name on stdout. For an unknown kind or identifier an
# error message is written to stderr and the current shell exits with
# status 2.
get_printable_name() {
    local kind=$1
    local value=$2
    local printable_name

    case "$kind" in

        interval)
            # Decide on the identifier that was passed in, not on whatever a
            # caller may have stored in a variable named "interval".
            case "$value" in
                avg10)
                    printable_name="10sec"
                    ;;
                avg60)
                    printable_name="60sec"
                    ;;
                avg300)
                    printable_name="5min"
                    ;;
                total)
                    printable_name="Total"
                    ;;
                *)
                    printf "ERROR: Could not determine interval %s ! Must be one of 'avg10' 'avg60' 'avg300' 'total'\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;

        scope)
            case "$value" in
                some)
                    printable_name="Some"
                    ;;
                full)
                    printable_name="Full"
                    ;;
                *)
                    printf "ERROR: Could not determine scope %s ! Must be one of 'full' 'some'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;

        resource)
            case "$value" in
                cpu)
                    printable_name="CPU"
                    ;;
                io)
                    printable_name="IO"
                    ;;
                memory)
                    printable_name="Memory"
                    ;;
                *)
                    printf "ERROR: Could not determine resource-type %s ! Must be one of 'cpu' 'io' 'memory'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;

        *)
            printf "ERROR: Could not determine kind %s ! Must be one of 'interval' 'scope' 'resource'\n" "$kind" >&2
            exit 2
            ;;
    esac

    printf "%s\n" "$printable_name"
}
|
|
|
|
# Print the munin "config" output for all graphs:
#   - one "pressure.<resource>_avg" graph per resource with every configured
#     average interval,
#   - one "pressure.<resource>_total" graph per resource for the totals,
#   - the "pressure" summary graph with the summary interval of each resource.
# The per-field lines are produced by output_config.
# Reads: pressure_resources, pressure_intervals, summary_interval
iterate_config() {
    local resource interval printable_resource

    # The lists are expanded unquoted on purpose: they may be held as a single
    # whitespace separated string and have to be split into words here.
    for resource in ${pressure_resources[@]}; do
        printable_resource=$( get_printable_name resource "$resource" )
        printf "multigraph pressure.%s_avg\n" "$resource"
        printf "graph_title %s Pressure Stall Information - Average\n" "$printable_resource"
        printf "graph_category system\n"
        printf "graph_info Average PSI based latency caused by lack of %s resources.\n" "$printable_resource"
        printf "graph_vlabel %%\n"
        printf "graph_scale no\n"
        for interval in ${pressure_intervals[@]}; do
            # No interval name is looked up here: output_config resolves the
            # printable names it needs for its labels itself.
            output_config "$resource" "$interval"
        done
        echo ""
    done

    for resource in ${pressure_resources[@]}; do
        printable_resource=$( get_printable_name resource "$resource" )
        printf "multigraph pressure.%s_total\n" "$resource"
        printf "graph_title %s Pressure Stall Information - Rate\n" "$printable_resource"
        printf "graph_category system\n"
        printf "graph_info Total PSI based latency rate caused by lack of %s resources.\n" "$printable_resource"
        printf "graph_vlabel rate\n"
        output_config "$resource" "total"
        echo ""
    done

    printf "multigraph pressure\n"
    printf "graph_title Pressure Stall Information - Average\n"
    printf "graph_vlabel %%\n"
    printf "graph_scale no\n"
    printf "graph_category system\n"
    printf "graph_info Average PSI based latency caused by lack of resources.\n"
    for resource in ${pressure_resources[@]}; do
        output_config "$resource" "$summary_interval"
    done
    echo ""
}
|
|
|
|
# Print the current values for all graphs, in the same graph order that
# iterate_config announces them: the per-resource average graphs, the
# per-resource total graphs and finally the summary graph.
# The individual value lines are produced by output_values.
# Reads: pressure_resources, pressure_intervals, summary_interval
iterate_values() {
    local res ivl

    # Average graphs: one per resource, every configured interval.
    for res in ${pressure_resources[@]}; do
        printf "multigraph pressure.%s_avg\n" $res
        for ivl in ${pressure_intervals[@]}; do
            output_values $res $ivl
        done
        printf "\n"
    done

    # Total graphs: one per resource.
    for res in ${pressure_resources[@]}; do
        printf "multigraph pressure.%s_total\n" $res
        output_values $res "total"
        printf "\n"
    done

    # Summary graph: the summary interval of every resource.
    printf "multigraph pressure\n"
    for res in ${pressure_resources[@]}; do
        output_values $res $summary_interval
    done
    printf "\n"
}
|
|
|
|
# Print the munin field configuration for one resource/interval combination.
#   $1  resource (e.g. 'cpu')
#   $2  interval ('avg10', 'avg60', 'avg300' or 'total')
# For every configured scope the field "psi_<resource>_<interval>_<scope>"
# gets a ".min" and a ".label" line, a ".warning"/".critical" line when the
# variable warn_<field>/crit_<field> is set to a non-empty value, and
# ".type DERIVE" when the interval is 'total'.
# For the 'cpu' resource only the 'some' scope is emitted.
# Reads: pressure_scopes and the warn_*/crit_* variables
output_config() {
    # All working variables are local, so the caller's variables of the same
    # name (for example its loop variables) are left untouched.
    local resource=$1
    local interval=$2
    local scope field level threshold_var
    local printable_resource printable_interval printable_scope

    printable_resource=$( get_printable_name resource "$resource" )
    printable_interval=$( get_printable_name interval "$interval" )

    # Expanded unquoted on purpose: the scopes may be held as one whitespace
    # separated string and have to be split into words here.
    for scope in ${pressure_scopes[@]}; do
        if [ "$resource" = "cpu" ] && [ "$scope" != "some" ]; then
            continue
        fi

        field="psi_${resource}_${interval}_${scope}"
        printable_scope=$( get_printable_name scope "$scope" )

        printf "%s.min 0\n" "$field"
        printf "%s.label %s %s %s\n" "$field" "$printable_resource" "$printable_interval" "$printable_scope"

        # Each entry is "<variable prefix>:<munin attribute>".
        for level in warn:warning crit:critical; do
            threshold_var="${level%%:*}_${field}"
            # Replace everything that is not valid in a variable name before
            # using the name for indirect expansion.
            threshold_var=${threshold_var//[^A-Za-z0-9_]/_}
            if [ -n "${!threshold_var}" ]; then
                printf "%s.%s %s\n" "$field" "${level##*:}" "${!threshold_var}"
            fi
        done

        if [ "$interval" = "total" ]; then
            printf "%s.type DERIVE\n" "$field"
        fi
    done
}
|
|
|
|
# Print the munin value lines for one resource/interval combination.
#   $1  resource (e.g. 'io')
#   $2  interval ('avg10', 'avg60', 'avg300' or 'total')
# Prints one "psi_<resource>_<interval>_<scope>.value <value>" line per
# configured scope. For the 'cpu' resource only the 'some' scope is emitted.
# Reads: pressure_scopes
output_values() {
    local resource=$1
    local interval=$2
    local scope value

    # Expanded unquoted on purpose: the scopes may be held as one whitespace
    # separated string and have to be split into words here.
    for scope in ${pressure_scopes[@]}; do
        if [ "$resource" = "cpu" ] && [ "$scope" != "some" ]; then
            continue
        fi

        value=$( get_pressure_value "$resource" "$interval" "$scope" )
        # When no value could be read, report "U" (munin's marker for an
        # unknown value) rather than a line with an empty value.
        printf "psi_%s_%s_%s.value %s\n" "$resource" "$interval" "$scope" "${value:-U}"
    done
}
|
|
|
|
# Write a short usage text, including an example plugin configuration, to
# stderr. Nothing is written to stdout.
output_usage() {
    local script_name=${0##*/}
    # Static part of the help text, one array element per output line.
    local -a settings_help=(
        "You may use environment settings in a plugin-config file, used by munin (for example /etc/munin/plugin-conf.d/munin-node) to further adjust settings."
        "You can use these settings to configure which resources, intervals or scopes are monitored or to configure warning and critical levels."
        "To do so use a syntax like this:"
        "[pressure]"
        "env.resources cpu io memory"
        "env.intervals avg10 avg60 avg300"
        "env.scopes some full"
        "env.summary_interval avg300"
        "env.warn_psi_cpu_avg300_some 5"
        "env.crit_psi_io_total_full 2000"
    )

    {
        printf "%s - munin plugin to graph pressure stall information for CPU, Memory and IO as reported by the Linux kernel.\n" $script_name
        printf "Usage: %s [config]\n" $script_name
        printf "%s\n" "${settings_help[@]}"
    } >&2
}
|
|
|
|
# Entry point: decide what to do from the command line.
#   no argument          print the current values
#   "config"             print the graph configuration
#   "auto" / "autoconf"  report whether the plugin can run on this host
#   anything else        print the usage text and exit with status 1
if [[ $# -eq 0 ]]; then
    iterate_values
elif [[ $# -gt 1 ]]; then
    output_usage
    exit 1
elif [[ $1 == "config" ]]; then
    iterate_config
elif [[ $1 == "auto" || $1 == "autoconf" ]]; then
    check_autoconf
else
    output_usage
    exit 1
fi
|