1
0
Fork 0
mirror of https://github.com/munin-monitoring/contrib.git synced 2025-07-21 02:33:18 +00:00

feat: add cpu_by_group plugin (#1468)

* feat: add cpu_by_group plugin

The current cpu_by_process plugin itself says that it can generate huge graphs which are difficult to decipher. This plugin tries to mitigate that issue by automatically grouping processes by their cgroup. This also solves the issue that cpu_by_process groups processes by process name, which means that at the moment you can't really differentiate between e.g. python processes in various cgroups. The remaining processes that are not running in a cgroup are also split between kernel threads and system processes.

For user & docker cgroups a human readable cgroup label is constructed (if munin has e.g. docker access). The safe_cgroup_name function in this plugin could probably be expanded for other often used cgroups.

* chore: add example graphs & update docs

* chore: update docs
This commit is contained in:
Pim 2025-01-07 09:11:48 +01:00 committed by GitHub
parent 50c90dec07
commit 67024140dd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 360 additions and 0 deletions

360
plugins/cpu/cpu_by_group Executable file
View file

@ -0,0 +1,360 @@
#!/bin/bash
# -*- sh -*-
set -e
: << =cut
=head1 NAME
cpu_by_group - Monitors cpu time for all processes on a system and groups them into graphs by either cgroup, system processes or kernel threads
=head1 DESCRIPTION
Works similarly to good ol' cpu_by_process, but groups processes by either their cgroup, system processes or kernel threads to produce smaller graphs.
For docker projects / containers the processes are grouped by either their docker compose project name or container name.
For user cgroups the user id will be resolved to the corresponding user name.
The plugin keeps track of previously running processes in MUNIN_STATEFILE
=head2 EXAMPLE GRAPHS
There are 3 example graphs which were all automatically generated by this plugin.
- System-day -> system processes (processes without a cgroup)
- KThread-day -> kernel threads
- caddy-day -> A docker compose project named caddy, with a single container named caddy and a single caddy process
=head1 CONFIGURATION
For full docker support, ensure that munin can access docker. F.e. run munin with the docker group:
[cpu_by_group]
group docker
=head1 REQUIREMENTS
- bash v4.3+
- docker access rights (if using docker)
=head1 AUTHOR
Copyright (C) 2025 pimlie
=head1 LICENSE
MIT
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf
=cut
# Load munin's shell helper library, but only when MUNIN_LIBDIR is set
if [[ -n "$MUNIN_LIBDIR" ]]; then
  source "$MUNIN_LIBDIR/plugins/plugin.sh"
fi
# File name this plugin was invoked as, prefix of every multigraph name
PLUGIN_BASE=$(basename "$0")
# File in which the process keys seen so far are remembered between runs
PLUGIN_CACHE=$MUNIN_STATEFILE
# Check if docker can be used
#
# Outputs: "1" on stdout when the docker cli is installed and 'docker ps'
#          succeeds for the current user, "0" otherwise
# Returns: 0 when docker can be used, 1 otherwise
function can_use_docker {
  if command -v docker >/dev/null && docker ps >/dev/null 2>&1; then
    echo 1
    return 0
  fi
  echo 0
  return 1
}
# can_use_docker returns non-zero on hosts without (access to) docker. This
# script runs with 'set -e', and a plain assignment takes the exit status of
# its command substitution, so without '|| true' the plugin would silently
# exit right here on every non-docker host.
HAS_DOCKER="$(can_use_docker)" || true # is a bool, 1=true
# Strip leading white space
#
# All arguments are joined into one string (via "$*"), every leading
# white space character is dropped and the remainder is printed on stdout.
function ltrim {
  local text="$*"
  # peel off one character at a time for as long as the string starts with
  # a white space character
  while [[ "$text" == [[:space:]]* ]]; do
    text="${text:1}"
  done
  echo "$text"
}
#
# Create a munin variable safe & human readable cgroup name
#
# Resulting names:
# - 'System' for processes that are not in a *.scope cgroup (or that are in
#   init.scope), or 'KThread' when the flags in /proc/<pid>/stat mark such a
#   process as a kernel thread
# - 'User-$user_name' for isolated user processes (the numeric user id is
#   used when it cannot be resolved to a name)
# - For docker containers (only when HAS_DOCKER is 1)
# - '$compose_project.$container_name' if the container carries the
#   com.docker.compose.project label
# - '$container_name' otherwise
# - The last component of the cgroup path without its extension for all
#   other scopes, including docker scopes that could not be inspected
# Underscores in the result are replaced by dashes.
#
# Arguments:
#   $1 - name of the variable that receives the result
#   $2 - content of /proc/<pid>/cgroup of the process
#   $3 - pid of the process, only used for the kernel thread check
# Globals:
#   HAS_DOCKER (read), docker_cache (read & written)
#
declare -A docker_cache
function safe_cgroup_name {
  local -n local_cgroup_name=$1
  local cgroup="$2"
  local pid="${3:-}"
  local label=""
  local stat_line="" flags user_id user docker_id docker_data
  local image_title compose_project
  local -a stats=() docker_names=()
  local -r pf_kthread=0x00200000 # kernel thread bit (PF_KTHREAD)

  if [[ "$cgroup" != *.scope || "$cgroup" == */init.scope ]]; then
    # if no cgroup scope, just list as system
    label="System"
    # Unless it's a kernel thread
    if [[ -n "$pid" && -r "/proc/$pid/stat" ]]; then
      # The process may be gone by now, a failed read is not an error
      { IFS= read -r -d '' stat_line < "/proc/$pid/stat"; } 2>/dev/null || true
      # The 2nd field (comm) is wrapped in parentheses and may contain spaces
      # or parentheses itself, so only split what follows the last ') '.
      # Fields are then: state ppid pgrp session tty_nr tpgid flags ...
      stat_line="${stat_line##*) }"
      IFS=' ' read -ra stats <<< "$stat_line" || true
      flags="${stats[6]:-}"
      if [[ "$flags" =~ ^[0-9]+$ ]] && (( (flags & pf_kthread) == pf_kthread )); then
        label="KThread"
      fi
    fi
  elif [[ "$cgroup" == */user.slice/user-* ]]; then
    # Extract user id and use user name as cgroup name
    user_id="${cgroup#*/user.slice/user-}"
    user_id="${user_id%%.*}"
    # An unknown user id must not abort the plugin (set -e), use the id itself
    user="$(id -nu "$user_id" 2>/dev/null)" || user="$user_id"
    label="User-${user}"
  elif [[ "$HAS_DOCKER" == 1 && "$cgroup" == */docker-* ]]; then
    # Extract docker container id and use either compose project otherwise
    # container name as cgroup name
    docker_id="${cgroup#*/docker-}"
    docker_id="${docker_id%%.*}"
    if [[ -n "$docker_id" ]]; then
      docker_data="${docker_cache["$docker_id"]:-}"
      if [[ -z "$docker_data" ]]; then
        # The container may have been removed in the meantime, treat a failed
        # inspect as 'no data' instead of aborting the plugin (set -e)
        docker_data="$(docker inspect --format='{{ .Name }} {{ index .Config.Labels "com.docker.compose.project" }}' "$docker_id" 2>/dev/null)" || docker_data=""
        if [[ -n "$docker_data" ]]; then
          docker_cache["$docker_id"]="$docker_data"
        fi
      fi
      IFS=' ' read -ra docker_names <<< "$docker_data" || true
      image_title="${docker_names[0]:-}"
      image_title="${image_title##*/}" # container names are reported as /name
      compose_project="${docker_names[1]:-}"
      if [[ -n "$compose_project" ]]; then
        label="$compose_project.$image_title"
      elif [[ -n "$image_title" ]]; then
        label="$image_title"
      fi
    fi
  fi

  if [[ -z "$label" ]]; then
    # Generic scope (or docker scope without usable docker data): use the
    # last path component up to its first dot
    label="${cgroup##*/}"
    label="${label%%.*}"
  fi

  # shellcheck disable=SC2034
  local_cgroup_name="${label//_/-}"
}
# Turn a process name into a munin safe variable name
#
# Arguments:
#   $1 - name of the variable that receives the result
#   $2 - process name
function safe_proc_name {
  local -n target_ref=$1
  local sanitized="$2"
  # keep only what precedes the first slash
  sanitized="${sanitized%%/*}"
  # drop a single trailing dot
  sanitized="${sanitized%.}"
  # everything that is not alphanumeric becomes an underscore
  # shellcheck disable=SC2034
  target_ref="${sanitized//[^a-zA-Z0-9]/_}"
}
# Format graph name as munin safe variable
#
# Munin field names may only consist of [A-Za-z0-9_] and must not start with
# a digit. Graph names can contain other characters (pipes and dots from the
# breadcrumb, dashes from container names), so every character outside that
# set is replaced by an underscore, as is an invalid first character.
#
# Arguments: $1 - graph name, 'cputime' is used when empty or unset
# Outputs:   the field name on stdout
function safe_graph_name {
  # Make the character ranges below byte based, whatever the caller's locale
  local LC_ALL=C
  local name="${1:-cputime}" # Use cputime as default value if unset
  name="${name//[^A-Za-z0-9_]/_}"
  if [[ "$name" == [!A-Za-z_]* ]]; then
    name="_${name:1}"
  fi
  echo "$name"
}
# Calculate process time in seconds
#
# Arguments:
#   $1 - name of the variable that receives the number of seconds
#   $2 - cpu time in ([days]-)[hour]:[min]:[sec] format
# Exits the script with status 1 (message on stderr) when $2 does not consist
# of three colon separated parts.
function calc_proc_time {
  local -n local_proc_time=$1
  local -a time_parts=() day_hour=()
  local day=0
  local hour

  IFS=":" read -ra time_parts <<< "$2"
  if [ "${#time_parts[@]}" -ne 3 ]; then
    echo "Expected time to be in ([days]-)[hour]:[min]:[sec] format, got '$2'" >&2
    exit 1
  fi

  # The first part is either 'hours' or 'days-hours'
  hour="${time_parts[0]}"
  IFS="-" read -ra day_hour <<< "$hour"
  if [ "${#day_hour[@]}" -gt 1 ]; then
    day="${day_hour[0]}"
    hour="${day_hour[1]}"
  fi

  # 10# forces base 10, otherwise zero padded values like 08 are parsed as
  # (invalid) octal numbers
  # shellcheck disable=SC2034
  local_proc_time="$(( (10#$day * 24 + 10#$hour) * 3600 + 10#${time_parts[1]} * 60 + 10#${time_parts[2]} ))"
}
# Get array of all previous & current running processes
#
# Fills the associative array named by $1. Keys have the form
# '<cgroup name>|<graph name>', values are the summed cpu seconds of all
# running processes that map to that key. Keys listed in PLUGIN_CACHE that
# have no running process (anymore) are present with value 0.
#
# Arguments: $1 - name of an associative array
# Globals:   PLUGIN_CACHE (read)
function get_processes {
  local -n procs=$1
  local cached_key proc_cpu proc_id proc_comm proc_rest
  local cgroup_file cgroup_name proc_name proc_time
  local breadcrumb multi_name multi_graph_name procs_key cur_value

  if [ -n "$PLUGIN_CACHE" ] && [ -r "$PLUGIN_CACHE" ] && [ -s "$PLUGIN_CACHE" ]; then
    while read -r cached_key || [[ -n "$cached_key" ]]; do
      # an empty key is not a valid array subscript
      [[ -n "$cached_key" ]] || continue
      procs["$cached_key"]=0
    done < "$PLUGIN_CACHE"
  fi

  # Read ps line by line through fd 3 instead of word splitting its output:
  # no global IFS juggling, no glob expansion, and commands in the loop body
  # cannot swallow the ps output from stdin. read already trims the column
  # padding; as before only the first word of comm is used.
  while read -r proc_cpu proc_id proc_comm proc_rest <&3; do
    [[ -n "$proc_id" ]] || continue

    # cat cgroup directly from /proc fs, as ps is bad for parsing multiple
    # variable width columns
    cgroup_file="/proc/$proc_id/cgroup"
    cgroup_name=""
    if [ -r "$cgroup_file" ]; then
      # the process can exit between the check and the read, which must not
      # abort the plugin (set -e)
      cgroup_name="$(cat "$cgroup_file" 2>/dev/null)" || cgroup_name=""
    fi

    proc_name="$proc_comm"
    proc_time=0
    safe_cgroup_name cgroup_name "$cgroup_name" "$proc_id"
    safe_proc_name proc_name "$proc_name"

    # Use everything before the first dot as the cgroup name, and everything
    # after the first dot as process name.
    # This ensures that we can also use prefixes within cgroup's, fe
    # for docker containers we want the compose projects as cgroup name
    # but still prefix individual processes with their container name
    breadcrumb="${cgroup_name}.${proc_name}"
    multi_name="${breadcrumb%%.*}"
    multi_graph_name="${breadcrumb#*.}"

    calc_proc_time proc_time "$proc_cpu"
    if [ "$proc_time" -gt 0 ]; then
      procs_key="${multi_name}|${multi_graph_name}"
      cur_value="${procs["$procs_key"]:-0}"
      procs["$procs_key"]="$(( cur_value + proc_time ))"
    fi
  done 3< <(ps -eo time,pid,comm h)
}
# Remember the given process keys for the next run
#
# Writes one key per line to PLUGIN_CACHE. Nothing is written when
# PLUGIN_CACHE is empty or its directory is not writable.
#
# Arguments: $1 - name of an array with the keys to store
# Globals:   PLUGIN_CACHE (read)
function cache_processes {
  local -n proc_names=$1
  if [ -n "$PLUGIN_CACHE" ] && [ -w "$(dirname "$PLUGIN_CACHE")" ]; then
    if (( ${#proc_names[@]} > 0 )); then
      # printf keeps every key on its own line, also when it contains spaces
      printf '%s\n' "${proc_names[@]}" > "$PLUGIN_CACHE"
    else
      # no keys: leave an empty file rather than a file with one empty line
      : > "$PLUGIN_CACHE"
    fi
  fi
}
# Print the multigraph header line
#
# Arguments: $1 - cgroup name; when non-empty it is appended to the plugin
#                 name with its dashes turned into underscores
# Globals:   PLUGIN_BASE (read)
function emit_multigraph_base {
  local suffix=""
  if [ -n "$1" ]; then
    suffix="_${1//-/_}"
  fi
  echo "multigraph ${PLUGIN_BASE}${suffix}"
}
# Print the graph level config lines
#
# Arguments: $1 - cgroup name, shown in the graph title with its dashes
#                 replaced by spaces
function emit_graph_base {
  local title_subject="${1//-/ }"
  printf '%s\n' \
    "graph_title CPU time for ${title_subject} processes" \
    "graph_args --base 1000" \
    "graph_vlabel seconds" \
    "graph_category processes" \
    "graph_info Shows CPU time used by each process name"
}
# Print the config lines of a single field
#
# Arguments:
#   $1 - graph name; used as label ('CPU time' when empty) and, passed
#        through safe_graph_name, as field name
#   $2 - draw type, STACK when empty or unset
function emit_graph_config {
  local field
  local caption
  local draw_mode="${2:-STACK}"
  field="$(safe_graph_name "$1")"
  caption="${1:-CPU time}"
  printf '%s\n' \
    "$field.label $caption" \
    "$field.min 0" \
    "$field.type DERIVE" \
    "$field.draw $draw_mode"
}
# Print the value line of a single field
#
# Arguments:
#   $1 - graph name, converted to a field name by safe_graph_name
#   $2 - value to report
function emit_value {
  local field
  field="$(safe_graph_name "$1")"
  printf '%s.value %s\n' "$field" "$2"
}
# Tell whether the given string is exactly 'config'
#
# Arguments: $1 - string to check
# Returns:   0 when $1 equals 'config', 1 otherwise
function is_config {
  [[ "$1" == "config" ]]
}
# Detect a change of cgroup and remember the new one
#
# Arguments:
#   $1 - name of the variable that holds the current cgroup
#   $2 - cgroup to compare against
# Returns: 1 when the current cgroup is non-empty and equal to $2; otherwise
#          $2 is stored in the variable named by $1 and 0 is returned
function is_new_cgroup {
  local -n tracked_ref=$1
  local candidate=$2
  if [[ -n "$tracked_ref" && "$candidate" == "$tracked_ref" ]]; then
    return 1
  fi
  tracked_ref=$candidate
  return 0
}
# Plugin entry point
#
# 'autoconf' prints yes. Every other invocation collects the cpu time of all
# processes and emits one multigraph per cgroup: field definitions when called
# with 'config', field values otherwise. Afterwards the seen process keys are
# stored for the next run.
case $1 in
  autoconf)
    echo "yes"
    ;;
  *)
    declare -A processes
    get_processes processes

    # The loop below opens a new multigraph whenever the cgroup part of the
    # key changes, so all keys of one cgroup have to be adjacent. Sort on the
    # cgroup field (case-insensitive first, exact as tie breaker) and then on
    # the process part, byte-wise so the result does not depend on the locale.
    process_names=()
    if (( ${#processes[@]} > 0 )); then
      mapfile -t process_names < <(
        printf '%s\n' "${!processes[@]}" \
          | LC_ALL=C sort -t '|' -k1,1f -k1,1 -k2f
      )
    fi

    cgroup=""
    for process_name in "${process_names[@]}"; do
      # key layout: <cgroup name>|<graph name>
      IFS="|" read -ra proc <<< "$process_name"

      if is_new_cgroup cgroup "${proc[0]}"; then
        emit_multigraph_base "$cgroup"

        if is_config "$1"; then
          emit_graph_base "$cgroup"
          # the first field of a graph is drawn as AREA
          draw_type="AREA"
        fi
      else
        # all following fields are stacked on top of the first one
        draw_type="STACK"
      fi

      if is_config "$1"; then
        emit_graph_config "${proc[1]}" "$draw_type"
      else
        emit_value "${proc[1]}" "${processes[$process_name]}"
      fi
    done

    cache_processes process_names
    ;;
esac

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB