#!/bin/bash

: << '=cut'

=head1 NAME

pressure - Plugin to monitor the pressure stall information for CPU, Memory and
IO as reported by the Linux kernel.

This plugin monitors the pressure stall information (psi) as reported by the
Linux Kernel. By default it reports all average intervals (10 seconds,
60 seconds and 300 seconds) as well as the total values as a rate of change
(DERIVE) for all resources (cpu, memory, io).
The average intervals can be configured if you only deem some of them useful.
See CONFIGURATION for explanations on that.

This is a multigraph plugin that, by default, will create six detail graphs and
one summary graph (so seven in total). The summary graph will contain the
300 seconds average percentages of all resources. The detail graphs are split
in two graphs per resource. One combining all average intervals and one for the
"totals" (rate of change) for the given resource.

There are no defaults for warnings and criticals, because this highly depends
on the system, so you need to configure them yourself (if you want any). It is
recommended that you first lookup the meaning of the different values.

For more information on psi see:
https://www.kernel.org/doc/html/latest/accounting/psi.html

=head1 CONFIGURATION

Simply create a symlink in your plugins directory like with any other plugin.
No additional configuration needed, no specific user required (typically).

If you want to configure alerts, just add "warn_" or "crit_" in front of the
internal name.

Optional configuration examples:

 [pressure]
 env.resources cpu io memory          - Specify the resources to monitor. Leave
                                        one out if you don't want this one to
                                        be monitored.
 env.intervals avg10 avg60 avg300     - Specify the average intervals to
                                        monitor. Leave one out if you don't
                                        want this one to be monitored.
 env.scopes some full                 - Specify the scopes to monitor. Leave
                                        one out if you don't want it to be
                                        monitored.
 env.summary_interval avg300          - Specify the interval to be used for the
                                        summary-graph.
 env.warn_psi_cpu_avg300_some 5       - Set a warning-level of 5 for
                                        "psi_cpu_avg300_some"
 env.crit_psi_io_total_full 2000      - Set a critical-level of 2000 for
                                        "psi_io_total_full"

=head1 AUTHOR

2022, HaseHarald

=head1 LICENSE

LGPLv3

=head1 BUGS

=head1 TODO

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf

=cut

# This file contains a munin-plugin to graph the psi (pressure) for CPU, Memory
# and IO, as reported by the Linux kernel.
#
# This is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this plugin.  If not, see <https://www.gnu.org/licenses/>.

# Built-in defaults, used whenever the matching env.* setting is absent.
resource_defaults=('cpu' 'io' 'memory')
interval_defaults=('avg10' 'avg60' 'avg300')
scope_defaults=('some' 'full')

# Effective settings. The list settings arrive from the environment as
# whitespace separated strings and are split into real arrays here, so the
# loops below can iterate over them with proper quoting.
pressure_dir=${pressure_dir:-/proc/pressure/}
read -r -a pressure_resources <<< "${resources:-${resource_defaults[*]}}"
read -r -a pressure_intervals <<< "${intervals:-${interval_defaults[*]}}"
read -r -a pressure_scopes <<< "${scopes:-${scope_defaults[*]}}"
summary_interval=${summary_interval:-avg300}

#######################################
# Answer the "autoconf" request.
# Globals:   pressure_dir (read)
# Outputs:   "yes" if pressure_dir is a directory, otherwise
#            "no (<dir> not found)".
#######################################
check_autoconf() {
    if [[ -d "${pressure_dir}" ]]; then
        printf 'yes\n'
    else
        printf 'no (%s not found)\n' "${pressure_dir}"
    fi
}

#######################################
# Read a single value from a pressure file.
# Globals:   pressure_dir (read)
# Arguments: $1 - resource (name of the file inside pressure_dir)
#            $2 - key to look up, e.g. avg10, avg60, avg300 or total
#            $3 - scope, i.e. the first word of the wanted line
#                 (defaults to "some")
# Outputs:   the numeric value on stdout; nothing if it cannot be found
# Returns:   non-zero if the file is not readable
#######################################
get_pressure_value() {
    local resource=$1
    local interval=$2
    local scope=${3:-some}
    # Strip one trailing slash so the join never yields "dir//file".
    local file="${pressure_dir%/}/${resource}"

    # Stay quiet here; the caller reports a missing value as unknown.
    [[ -r "${file}" ]] || return 1

    # Pick the line whose first field is exactly the scope, then the
    # key=value pair with the requested key. Only plain non-negative
    # numbers (optionally with a fraction) are accepted.
    awk -v scope="${scope}" -v key="${interval}" '
        $1 == scope {
            for (i = 2; i <= NF; i++) {
                n = split($i, kv, "=")
                if (n == 2 && kv[1] == key && kv[2] ~ /^[0-9]+(\.[0-9]+)?$/) {
                    print kv[2]
                    exit
                }
            }
        }
    ' "${file}"
}

#######################################
# Translate an internal identifier into the text used in graph labels.
# Arguments: $1 - kind of identifier: interval, scope or resource
#            $2 - the identifier to translate
# Outputs:   the printable name on stdout; an error message on stderr if
#            the kind or the identifier is unknown
# Returns:   0 on success, 2 for an unknown kind or identifier
#######################################
get_printable_name() {
    local kind=$1
    local value=$2
    local printable_name

    case "${kind}" in
        interval)
            # Decide on the argument, not on whatever a caller happens to
            # have stored in a variable called "interval".
            case "${value}" in
                avg10) printable_name="10sec" ;;
                avg60) printable_name="60sec" ;;
                avg300) printable_name="5min" ;;
                total) printable_name="Total" ;;
                *)
                    printf "ERROR: Could not determine interval %s ! Must be one of 'avg10' 'avg60' 'avg300' 'total'\n" "${value}" >&2
                    return 2
                    ;;
            esac
            ;;
        scope)
            case "${value}" in
                some) printable_name="Some" ;;
                full) printable_name="Full" ;;
                *)
                    printf "ERROR: Could not determine scope %s ! Must be one of 'full' 'some'.\n" "${value}" >&2
                    return 2
                    ;;
            esac
            ;;
        resource)
            case "${value}" in
                cpu) printable_name="CPU" ;;
                io) printable_name="IO" ;;
                memory) printable_name="Memory" ;;
                *)
                    printf "ERROR: Could not determine resource-type %s ! Must be one of 'cpu' 'io' 'memory'.\n" "${value}" >&2
                    return 2
                    ;;
            esac
            ;;
        *)
            printf "ERROR: Could not determine kind %s ! Must be one of 'interval' 'scope' 'resource'\n" "${kind}" >&2
            return 2
            ;;
    esac

    printf '%s\n' "${printable_name}"
}

#######################################
# Print the configuration of all graphs: one "average" graph and one
# "total" graph per resource, followed by the summary graph.
# Globals:   pressure_resources, pressure_intervals, summary_interval (read)
# Outputs:   munin multigraph configuration on stdout
#######################################
iterate_config() {
    local resource interval printable_resource

    for resource in "${pressure_resources[@]}"; do
        printable_resource=$(get_printable_name resource "${resource}")
        printf 'multigraph pressure.%s_avg\n' "${resource}"
        printf 'graph_title %s Pressure Stall Information - Average\n' "${printable_resource}"
        printf 'graph_category system\n'
        printf 'graph_info Average PSI based latency caused by lack of %s resources.\n' "${printable_resource}"
        printf 'graph_vlabel %%\n'
        printf 'graph_scale no\n'
        for interval in "${pressure_intervals[@]}"; do
            output_config "${resource}" "${interval}"
        done
        printf '\n'
    done

    for resource in "${pressure_resources[@]}"; do
        printable_resource=$(get_printable_name resource "${resource}")
        printf 'multigraph pressure.%s_total\n' "${resource}"
        printf 'graph_title %s Pressure Stall Information - Rate\n' "${printable_resource}"
        printf 'graph_category system\n'
        printf 'graph_info Total PSI based latency rate caused by lack of %s resources.\n' "${printable_resource}"
        printf 'graph_vlabel rate\n'
        output_config "${resource}" total
        printf '\n'
    done

    printf 'multigraph pressure\n'
    printf 'graph_title Pressure Stall Information - Average\n'
    printf 'graph_vlabel %%\n'
    printf 'graph_scale no\n'
    printf 'graph_category system\n'
    printf 'graph_info Average PSI based latency caused by lack of resources.\n'
    for resource in "${pressure_resources[@]}"; do
        output_config "${resource}" "${summary_interval}"
    done
    printf '\n'
}

#######################################
# Print the current values for all graphs, in the same graph order that
# iterate_config uses.
# Globals:   pressure_resources, pressure_intervals, summary_interval (read)
# Outputs:   munin multigraph values on stdout
#######################################
iterate_values() {
    local resource interval

    for resource in "${pressure_resources[@]}"; do
        printf 'multigraph pressure.%s_avg\n' "${resource}"
        for interval in "${pressure_intervals[@]}"; do
            output_values "${resource}" "${interval}"
        done
        printf '\n'
    done

    for resource in "${pressure_resources[@]}"; do
        printf 'multigraph pressure.%s_total\n' "${resource}"
        output_values "${resource}" total
        printf '\n'
    done

    printf 'multigraph pressure\n'
    for resource in "${pressure_resources[@]}"; do
        output_values "${resource}" "${summary_interval}"
    done
    printf '\n'
}

#######################################
# Print the field configuration for one resource/interval pair, one field
# per configured scope. The combination cpu/full is never emitted.
# Globals:   pressure_scopes (read); warn_psi_* and crit_psi_* (read,
#            optional alert levels named after the field)
# Arguments: $1 - resource
#            $2 - interval (avg10, avg60, avg300 or total)
# Outputs:   munin field configuration on stdout
#######################################
output_config() {
    local resource=$1
    local interval=$2
    local scope field warn_var crit_var
    local printable_resource printable_interval printable_scope

    printable_resource=$(get_printable_name resource "${resource}")
    printable_interval=$(get_printable_name interval "${interval}")

    for scope in "${pressure_scopes[@]}"; do
        if [[ "${resource}" == "cpu" && "${scope}" != "some" ]]; then
            continue
        fi

        printable_scope=$(get_printable_name scope "${scope}")
        field="psi_${resource}_${interval}_${scope}"

        printf '%s.min 0\n' "${field}"
        printf '%s.label %s %s %s\n' "${field}" \
            "${printable_resource}" "${printable_interval}" "${printable_scope}"

        # Alert levels live in variables named warn_<field> / crit_<field>.
        # Replace anything that cannot be part of a variable name before
        # using the name for indirect expansion.
        warn_var="warn_${field}"
        warn_var=${warn_var//[^A-Za-z0-9_]/_}
        if [[ -n "${!warn_var}" ]]; then
            printf '%s.warning %s\n' "${field}" "${!warn_var}"
        fi

        crit_var="crit_${field}"
        crit_var=${crit_var//[^A-Za-z0-9_]/_}
        if [[ -n "${!crit_var}" ]]; then
            printf '%s.critical %s\n' "${field}" "${!crit_var}"
        fi

        # "total" is graphed as a rate of change rather than as-is.
        if [[ "${interval}" == "total" ]]; then
            printf '%s.type DERIVE\n' "${field}"
        fi
    done
}

#######################################
# Print the values for one resource/interval pair, one line per configured
# scope. The combination cpu/full is never emitted. A value that cannot be
# read is reported as "U" (unknown) instead of an empty string.
# Globals:   pressure_scopes (read)
# Arguments: $1 - resource
#            $2 - interval (avg10, avg60, avg300 or total)
# Outputs:   munin field values on stdout
#######################################
output_values() {
    local resource=$1
    local interval=$2
    local scope value

    for scope in "${pressure_scopes[@]}"; do
        if [[ "${resource}" == "cpu" && "${scope}" != "some" ]]; then
            continue
        fi

        value=$(get_pressure_value "${resource}" "${interval}" "${scope}")
        printf 'psi_%s_%s_%s.value %s\n' \
            "${resource}" "${interval}" "${scope}" "${value:-U}"
    done
}

#######################################
# Print a short usage text.
# Outputs:   usage text on stderr
#######################################
output_usage() {
    local name=${0##*/}

    cat >&2 << EOF
${name} - munin plugin to graph pressure stall information for CPU, Memory and IO as reported by the Linux kernel.
Usage: ${name} [config]
You may use environment settings in a plugin-config file, used by munin (for example /etc/munin/plugin-conf.d/munin-node) to further adjust settings.
You can use these settings to configure which resources, intervals or scopes are monitored or to configure warning and critical levels.
To do so use a syntax like this:
[pressure]
env.resources cpu io memory
env.intervals avg10 avg60 avg300
env.scopes some full
env.summary_interval avg300
env.warn_psi_cpu_avg300_some 5
env.crit_psi_io_total_full 2000
EOF
}

# Dispatch: no argument prints values; a single argument selects autoconf
# or config; anything else prints the usage text and fails.
case $# in
    0)
        iterate_values
        ;;
    1)
        case "$1" in
            auto | autoconf)
                check_autoconf
                ;;
            config)
                iterate_config
                ;;
            *)
                output_usage
                exit 1
                ;;
        esac
        ;;
    *)
        output_usage
        exit 1
        ;;
esac