From 5389b09abe53c43935ee2775d9106b8bc0fc0d00 Mon Sep 17 00:00:00 2001 From: HaseHarald Date: Mon, 21 Mar 2022 18:12:43 +0100 Subject: [PATCH] [system/pressure] Add plugin to monitor pressure stall information (psi) Adds a plugin to monitor the pressure stall information (psi) as reported by the Linux kernel. - groups averages per resource - rate/derive totals for ease of reading - resources, intervals and scopes configurable See: https://www.kernel.org/doc/html/latest/accounting/psi.html --- plugins/system/pressure | 331 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100755 plugins/system/pressure diff --git a/plugins/system/pressure b/plugins/system/pressure new file mode 100755 index 00000000..84cbd0d5 --- /dev/null +++ b/plugins/system/pressure @@ -0,0 +1,331 @@ +#!/bin/bash + + +: << =cut + +=head1 NAME + +pressure - Plugin to monitor the pressure stall information for CPU, Memory and +IO as reported by the Linux kernel. + +This plugin monitors the pressure stall information (psi) as reported by the +Linux Kernel. By default it reports all average intervals (10 seconds, +60 seconds and 300 seconds) as well as the total values as a rate of change +(DERIVE) for all resources (cpu, memory, io). The average intervals can be +configured if you only deem some of them useful. See CONFIGURATION for +explanations on that. + +This is a multigraph plugin that, by default, will create six detail graphs and +one summary graph (so seven in total). The summary graph will contain the 300 +seconds average percentages of all resources. The detail graphs are split in two +graphs per resource. One combining all average intervals and one for the +"totals" (rate of change) for the given resource. + +There are no defaults for warnings and criticals, because this highly depends on +the system, so you need to configure them yourself (if you want any). It is +recommended that you first lookup the meaning of the different values. + +For more information on psi see: +https://www.kernel.org/doc/html/latest/accounting/psi.html + +=head1 CONFIGURATION + +Simply create a symlink in your plugins directory like with any other plugin. +No additional configuration needed, no specific user required (typically). + +If you want to configure alerts, just add "warn_" or "crit_" in front of the +internal name. + +Optional configuration examples: + +[pressure] +env.resources cpu io memory - Specify the resources to monitor. Leave one + out if you don't want this one to be + monitored. +env.intervals avg10 avg60 avg300 - Sepcify the average intervals to monitor. + Leave one out if you don't want this one to + be monitored +env.scopes some full - Specify the scopes to monitor. Leave one out + If you don't want it to be monitored. +env.summary_interval avg300 - Specify the interval to be used for the + summary-graph. +env.warn_psi_cpu_avg300_some 5 - Set a warning-level of 5 for + "psi_cpu_avg300_some" +env.crit_psi_io_total_full 2000 - Set a critical-level of 2000 for + "psi_io_total_full" + +=head1 AUTHOR + +2022, HaseHarald + +=head1 LICENSE + +LGPLv3 + +=head1 BUGS + +=head1 TODO + +=head1 MAGIC MARKERS + + #%# family=auto + #%# capabilities=autoconf + +=cut + + +# This file contains a munin-plugin to graph the psi (pressure) for CPU, Memory +# and IO, as reported by the Linux kernel. +# +# This is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this plugin. If not, see . + + +resource_defaults=('cpu' 'io' 'memory') +interval_defaults=('avg10' 'avg60' 'avg300') +scope_defaults=('some' 'full') +pressure_dir=${pressure_dir:-'/proc/pressure/'} +pressure_resources=${resources[@]:-${resource_defaults[@]}} +pressure_intervals=${intervals[@]:-${interval_defaults[@]}} +pressure_scopes=${scopes[@]:-${scope_defaults[@]}} +summary_interval=${summary_interval:-avg300} + +check_autoconf() { + if [ -d "${pressure_dir}" ]; then + printf "yes\n" + else + printf "no (%s not found)\n" ${pressure_dir} + fi +} + +get_pressure_value() { + resource=$1 + interval=$2 + scope=${3:-some} + grep "$scope" ${pressure_dir}//${resource} | grep -o -E "${interval}=[0-9]{1,}(\.[0-9]{1,}){0,1}" | cut -d '=' -f 2 +} + +get_printable_name() { + kind=$1 + value=$2 + + case $kind in + + interval) + case $interval in + avg10) + printable_name="10sec" + ;; + avg60) + printable_name="60sec" + ;; + avg300) + printable_name="5min" + ;; + total) + printable_name="Total" + ;; + *) + printf "ERROR: Could not determine interval %s ! Must be one of 'avg10' 'avg60' 'avg300' 'total'\n" $value >&2 + exit 2 + ;; + esac + ;; + + scope) + case $value in + some) + printable_name="Some" + ;; + full) + printable_name="Full" + ;; + *) + printf "ERROR: Could not determine scope %s ! Must be one of 'full' 'some'.\n" $value >&2 + exit 2 + ;; + esac + ;; + + resource) + case $value in + cpu) + printable_name="CPU" + ;; + io) + printable_name="IO" + ;; + memory) + printable_name="Memory" + ;; + *) + printf "ERROR: Could not determine resource-type %s ! Must be one of 'cpu' 'io' 'memory'.\n" $value >&2 + exit 2 + ;; + esac + ;; + + *) + printf "ERROR: Could not determine kind %s ! Must be one of 'interval' 'scope' 'resource'\n" $kind >&2 + exit 2 + ;; + esac + + printf "%s\n" $printable_name +} + +iterate_config() { + for resource in ${pressure_resources[@]}; do + printable_resource=$( get_printable_name resource $resource ) + printf "multigraph pressure.%s_avg\n" $resource + printf "graph_title %s Pressure Stall Information - Average\n" $printable_resource + printf "graph_category system\n" + printf "graph_info Average PSI based latency caused by lack of %s resources.\n" $printable_resource + printf "graph_vlabel %%\n" + printf "graph_scale no\n" + for interval in ${pressure_intervals[@]}; do + printable_interval=$( get_printable_name interval $interval ) + output_config $resource $interval + done + echo "" + done + + for resource in ${pressure_resources[@]}; do + printable_resource=$( get_printable_name resource $resource ) + printf "multigraph pressure.%s_total\n" $resource + printf "graph_title %s Pressure Stall Information - Rate\n" $printable_resource + printf "graph_category system\n" + printf "graph_info Total PSI based latency rate caused by lack of %s resources.\n" $printable_resource + printf "graph_vlabel rate\n" + interval="total" + output_config $resource $interval + echo "" + done + + printf "multigraph pressure\n" + printf "graph_title Pressure Stall Information - Average\n" + printf "graph_vlabel %%\n" + printf "graph_scale no\n" + printf "graph_category system\n" + printf "graph_info Average PSI based latency caused by lack of resources.\n" + for resource in ${pressure_resources[@]}; do + output_config $resource $summary_interval + done + echo "" +} + +iterate_values() { + for resource in ${pressure_resources[@]}; do + printf "multigraph pressure.%s_avg\n" $resource + for interval in ${pressure_intervals[@]}; do + output_values $resource $interval + done + echo "" + done + + for resource in ${pressure_resources[@]}; do + printf "multigraph pressure.%s_total\n" $resource + interval="total" + output_values $resource $interval + echo "" + done + + printf "multigraph pressure\n" + for resource in ${pressure_resources[@]}; do + output_values $resource $summary_interval + done + echo "" +} + +output_config() { + resource=$1 + interval=$2 + + printable_resource=$( get_printable_name resource $resource ) + printable_interval=$( get_printable_name interval $interval ) + + for scope in ${pressure_scopes[@]}; do + if [ ${resource} == "cpu" ] && [ ${scope} != "some" ]; then + continue + else + printable_scope=$( get_printable_name scope $scope ) + printf "psi_%s_%s_%s.min 0\n" $resource $interval $scope + printf "psi_%s_%s_%s.label %s %s %s\n" $resource $interval $scope $printable_resource $printable_interval $printable_scope + this_warn_var=$( echo "warn_psi_${resource}_${interval}_${scope}" | sed 's/[^A-Za-z0-9_]/_/g' ) + if [ -n "${!this_warn_var}" ]; then + printf "psi_%s_%s_%s.warning %s\n" $resource $interval $scope ${!this_warn_var} + fi + this_crit_var=$( echo "crit_psi_${resource}_${interval}_${scope}" | sed 's/[^A-Za-z0-9_]/_/g' ) + if [ -n "${!this_crit_var}" ]; then + printf "psi_%s_%s_%s.critical %s\n" $resource $interval $scope ${!this_crit_var} + fi + if [ $interval == "total" ]; then + printf "psi_%s_%s_%s.type DERIVE\n" $resource $interval $scope + fi + fi + done +} + +output_values() { + resource=$1 + interval=$2 + + for scope in ${pressure_scopes[@]}; do + if [ ${resource} == "cpu" ] && [ ${scope} != "some" ]; then + continue + else + printf "psi_%s_%s_%s.value %s\n" $resource $interval $scope $(get_pressure_value $resource $interval $scope) + fi + done +} + +output_usage() { + printf >&2 "%s - munin plugin to graph pressure stall information for CPU, Memory and IO as reported by the Linux kernel.\n" ${0##*/} + printf >&2 "Usage: %s [config]\n" ${0##*/} + printf >&2 "You may use environment settings in a plugin-config file, used by munin (for example /etc/munin/plugin-conf.d/munin-node) to further adjust settings.\n" + printf >&2 "You can use these settings to configure which resources, intervals or scopes are monitored or to configure warning and critical levels.\n" + printf >&2 "To do so use a syntax like this:\n" + printf >&2 "[pressure]\n" + printf >&2 "env.resources cpu io memory\n" + printf >&2 "env.intervals avg10 avg60 avg300\n" + printf >&2 "env.scopes some full\n" + printf >&2 "env.summary_interval avg300\n" + printf >&2 "env.warn_psi_cpu_avg300_some 5\n" + printf >&2 "env.crit_psi_io_total_full 2000\n" +} + +case $# in + 0) + iterate_values + ;; + + 1) + case $1 in + auto|autoconf) + check_autoconf + ;; + config) + iterate_config + ;; + *) + output_usage + exit 1 + ;; + esac + ;; + + *) + output_usage + exit 1 + ;; +esac