#!/bin/bash

: << =cut

=head1 NAME

linux_psi - Plugin to monitor the pressure stall information for CPU, Memory and
IO as reported by the Linux kernel.

This plugin monitors the pressure stall information (psi) as reported by the
Linux Kernel. By default it reports all average intervals (10 seconds,
60 seconds and 300 seconds) as well as the total values as a rate of change
(DERIVE) for all resources (cpu, memory, io).
The average intervals can be configured if you only deem some of them useful.
See CONFIGURATION for explanations on that.

This is a multigraph plugin that, by default, will create six detail graphs and
one summary graph (so seven in total). The summary graph will contain the
300 seconds average percentages of all resources. The detail graphs are split in
two graphs per resource. One combining all average intervals and one for the
"totals" (rate of change) for the given resource.

There are no defaults for warnings and criticals, because this highly depends on
the system, so you need to configure them yourself (if you want any). It is
recommended that you first look up the meaning of the different values.

For more information on psi see:
https://www.kernel.org/doc/html/latest/accounting/psi.html

=head1 CONFIGURATION

Simply create a symlink in your plugins directory like with any other plugin.
No additional configuration needed, no specific user required (typically).

If you want to configure alerts, just add "warn_" or "crit_" in front of the
internal name.

Optional configuration examples:

  [linux_psi]
  env.resources cpu io memory          - Specify the resources to monitor. Leave one out if you don't want this one to be monitored.
  env.intervals avg10 avg60 avg300     - Specify the average intervals to monitor. Leave one out if you don't want this one to be monitored
  env.scopes some full                 - Specify the scopes to monitor. Leave one out if you don't want it to be monitored.
  env.summary_interval avg300          - Specify the interval to be used for the summary-graph.
  env.warn_psi_cpu_avg300_some 5       - Set a warning-level of 5 for "psi_cpu_avg300_some"
  env.crit_psi_io_total_full 2000      - Set a critical-level of 2000 for "psi_io_total_full"

=head1 AUTHOR

2022, HaseHarald

=head1 LICENSE

LGPLv3

=head1 BUGS

=head1 TODO

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf

=cut

# This file contains a munin-plugin to graph the psi (pressure) for CPU, Memory
# and IO, as reported by the Linux kernel.
#
# This is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this plugin.  If not, see <https://www.gnu.org/licenses/>.

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------

resource_defaults=('cpu' 'io' 'memory')
interval_defaults=('avg10' 'avg60' 'avg300')
scope_defaults=('some' 'full')

# Directory holding one pressure file per resource (overridable via env).
pressure_dir=${pressure_dir:-'/proc/pressure/'}
# Interval shown in the summary graph.
summary_interval="${summary_interval:-avg300}"

# split_setting ARRAY_NAME STRING
# Splits STRING on blanks and stores the words in the global array ARRAY_NAME.
# Environment variables are plain strings, so a setting such as
# "env.resources cpu io" arrives as the single value "cpu io" and has to be
# split explicitly to become a list.
split_setting() {
    IFS=$' \t\n' read -r -a "$1" <<< "$2"
}

# "${name[*]}" yields the plain value for a scalar and the space-joined
# elements for an array, so both kinds of input end up as a word list.
split_setting pressure_resources "${resources[*]:-${resource_defaults[*]}}"
split_setting pressure_intervals "${intervals[*]:-${interval_defaults[*]}}"
split_setting pressure_scopes "${scopes[*]:-${scope_defaults[*]}}"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Answers munin's "autoconf" request: "yes" when the pressure directory
# exists, otherwise "no" followed by the reason.
check_autoconf() {
    if [ -d "${pressure_dir}" ]; then
        printf "yes\n"
    else
        printf "no (%s not found)\n" "${pressure_dir}"
    fi
}

# get_pressure_value RESOURCE INTERVAL [SCOPE]
# Prints the number stored under "INTERVAL=" on the line(s) of
# ${pressure_dir}/RESOURCE that contain SCOPE (default: "some").
# Prints nothing when no such field is found.
get_pressure_value() {
    local resource="$1"
    local interval="$2"
    local scope="${3:-some}"

    grep -- "$scope" "${pressure_dir}/${resource}" \
        | grep -o -E -- "${interval}=[0-9]{1,}(\.[0-9]{1,}){0,1}" \
        | cut -d '=' -f 2
}

# get_printable_name KIND VALUE
# Prints the human readable label for VALUE, where KIND is one of
# 'interval', 'scope' or 'resource'.
# On an unknown KIND or VALUE an error is written to stderr and the function
# exits with status 2. All callers in this file invoke it inside a command
# substitution, so that exit only ends the substitution's subshell and the
# caller receives an empty label.
get_printable_name() {
    local kind="$1"
    local value="$2"
    local printable_name=""

    case "$kind" in
        interval)
            # Dispatch on the argument, not on whatever variable named
            # "interval" the caller happens to have in scope.
            case "$value" in
                avg10)  printable_name="10sec" ;;
                avg60)  printable_name="60sec" ;;
                avg300) printable_name="5min" ;;
                total)  printable_name="Total" ;;
                *)
                    printf "ERROR: Could not determine interval %s ! Must be one of 'avg10' 'avg60' 'avg300' 'total'\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;
        scope)
            case "$value" in
                some) printable_name="Some" ;;
                full) printable_name="Full" ;;
                *)
                    printf "ERROR: Could not determine scope %s ! Must be one of 'full' 'some'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;
        resource)
            case "$value" in
                cpu)    printable_name="CPU" ;;
                io)     printable_name="IO" ;;
                memory) printable_name="Memory" ;;
                *)
                    printf "ERROR: Could not determine resource-type %s ! Must be one of 'cpu' 'io' 'memory'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;
        *)
            printf "ERROR: Could not determine kind %s ! Must be one of 'interval' 'scope' 'resource'\n" "$kind" >&2
            exit 2
            ;;
    esac

    printf "%s" "$printable_name"
}

# ---------------------------------------------------------------------------
# Multigraph output
# ---------------------------------------------------------------------------

# Prints the complete "config" answer: one averages graph and one totals
# graph per configured resource, followed by the summary graph that shows
# ${summary_interval} for every resource.
iterate_config() {
    local resource
    local interval
    local printable_resource

    for resource in "${pressure_resources[@]}"; do
        printable_resource=$( get_printable_name resource "$resource" )
        printf "multigraph linux_psi.%s_avg\n" "$resource"
        printf "graph_title %s Pressure Stall Information - Average\n" "$printable_resource"
        printf "graph_category system\n"
        printf "graph_info Average PSI based latency caused by lack of %s resources.\n" "$printable_resource"
        printf "graph_vlabel %%\n"
        printf "graph_scale no\n"
        for interval in "${pressure_intervals[@]}"; do
            output_config "$resource" "$interval"
        done
        printf "\n"
    done

    for resource in "${pressure_resources[@]}"; do
        printable_resource=$( get_printable_name resource "$resource" )
        printf "multigraph linux_psi.%s_total\n" "$resource"
        printf "graph_title %s Pressure Stall Information - Rate\n" "$printable_resource"
        printf "graph_category system\n"
        printf "graph_info Total PSI based latency rate caused by lack of %s resources.\n" "$printable_resource"
        printf "graph_vlabel rate\n"
        output_config "$resource" "total"
        printf "\n"
    done

    printf "multigraph linux_psi\n"
    printf "graph_title Pressure Stall Information - Average\n"
    printf "graph_vlabel %%\n"
    printf "graph_scale no\n"
    printf "graph_category system\n"
    printf "graph_info Average PSI based latency caused by lack of resources.\n"
    for resource in "${pressure_resources[@]}"; do
        output_config "$resource" "$summary_interval"
    done
    printf "\n"
}

# Prints the complete "fetch" answer, using the same graph layout and order
# as iterate_config.
iterate_values() {
    local resource
    local interval

    for resource in "${pressure_resources[@]}"; do
        printf "multigraph linux_psi.%s_avg\n" "$resource"
        for interval in "${pressure_intervals[@]}"; do
            output_values "$resource" "$interval"
        done
        printf "\n"
    done

    for resource in "${pressure_resources[@]}"; do
        printf "multigraph linux_psi.%s_total\n" "$resource"
        output_values "$resource" "total"
        printf "\n"
    done

    printf "multigraph linux_psi\n"
    for resource in "${pressure_resources[@]}"; do
        output_values "$resource" "$summary_interval"
    done
    printf "\n"
}

# output_config RESOURCE INTERVAL
# Prints the field definitions (min, label, optional warning/critical and,
# for the "total" interval, the DERIVE type) for every configured scope.
# Thresholds are read from variables named warn_<field> / crit_<field>.
output_config() {
    local resource="$1"
    local interval="$2"
    local scope
    local field
    local printable_resource
    local printable_interval
    local printable_scope
    local this_warn_var
    local this_crit_var

    printable_resource=$( get_printable_name resource "$resource" )
    printable_interval=$( get_printable_name interval "$interval" )

    for scope in "${pressure_scopes[@]}"; do
        # Only the "some" scope is reported for cpu.
        # NOTE(review): presumably because "full" is not meaningful for cpu
        # at system level - confirm against the kernel PSI documentation.
        if [ "${resource}" == "cpu" ] && [ "${scope}" != "some" ]; then
            continue
        fi

        field="psi_${resource}_${interval}_${scope}"
        printable_scope=$( get_printable_name scope "$scope" )

        # Variable names may only consist of [A-Za-z0-9_]; map anything else
        # to "_" before using the name for indirect expansion.
        this_warn_var="warn_${field}"
        this_warn_var="${this_warn_var//[^A-Za-z0-9_]/_}"
        this_crit_var="crit_${field}"
        this_crit_var="${this_crit_var//[^A-Za-z0-9_]/_}"

        printf "%s.min 0\n" "$field"
        printf "%s.label %s %s %s\n" "$field" "$printable_resource" "$printable_interval" "$printable_scope"
        if [ -n "${!this_warn_var}" ]; then
            printf "%s.warning %s\n" "$field" "${!this_warn_var}"
        fi
        if [ -n "${!this_crit_var}" ]; then
            printf "%s.critical %s\n" "$field" "${!this_crit_var}"
        fi
        if [ "$interval" == "total" ]; then
            printf "%s.type DERIVE\n" "$field"
        fi
    done
}

# output_values RESOURCE INTERVAL
# Prints one "<field>.value <number>" line per configured scope, applying the
# same cpu/"some" restriction as output_config.
output_values() {
    local resource="$1"
    local interval="$2"
    local scope

    for scope in "${pressure_scopes[@]}"; do
        if [ "${resource}" == "cpu" ] && [ "${scope}" != "some" ]; then
            continue
        fi
        printf "psi_%s_%s_%s.value %s\n" "$resource" "$interval" "$scope" \
            "$(get_pressure_value "$resource" "$interval" "$scope")"
    done
}

# Prints a short usage and configuration help text to stderr.
output_usage() {
    printf >&2 "%s - munin plugin to graph pressure stall information for CPU, Memory and IO as reported by the Linux kernel.\n" "${0##*/}"
    printf >&2 "Usage: %s [config]\n" "${0##*/}"
    printf >&2 "You may use environment settings in a plugin-config file, used by munin (for example /etc/munin/plugin-conf.d/munin-node) to further adjust settings.\n"
    printf >&2 "You can use these settings to configure which resources, intervals or scopes are monitored or to configure warning and critical levels.\n"
    printf >&2 "To do so use a syntax like this:\n"
    printf >&2 "[linux_psi]\n"
    printf >&2 "env.resources cpu io memory\n"
    printf >&2 "env.intervals avg10 avg60 avg300\n"
    printf >&2 "env.scopes some full\n"
    printf >&2 "env.summary_interval avg300\n"
    printf >&2 "env.warn_psi_cpu_avg300_some 5\n"
    printf >&2 "env.crit_psi_io_total_full 2000\n"
}

# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

# Dispatches on the single optional argument: no argument or "fetch" prints
# values, "config" prints the graph configuration, "autoconf" runs the
# availability check. Anything else prints the usage and exits with status 1.
main() {
    case "$#" in
        0)
            iterate_values
            ;;
        1)
            case "$1" in
                autoconf)
                    check_autoconf
                    ;;
                config)
                    iterate_config
                    ;;
                fetch)
                    iterate_values
                    ;;
                *)
                    output_usage
                    exit 1
                    ;;
            esac
            ;;
        *)
            output_usage
            exit 1
            ;;
    esac
}

main "$@"