#!/bin/bash
# -*- sh -*-

set -e

: << =cut

=head1 NAME

cpu_by_group - Monitors cpu time for all processes on a system and groups them into graphs by either cgroup, system processes or kernel threads

=head1 DESCRIPTION

Works similarly to good ol' cpu_by_process, but groups processes by either their cgroup, system processes or kernel threads to produce smaller graphs.

For docker projects / containers the processes are grouped by either their docker compose project name or container name.
For user cgroups the user id will be resolved to the corresponding user name.

The plugin keeps track of previously running processes in MUNIN_STATEFILE

=head2 EXAMPLE GRAPHS

There are 3 example graphs which were all automatically generated by this plugin.

- System-day -> system processes (processes without a cgroup)
- KThread-day -> kernel threads
- caddy-day -> A docker compose project named caddy, with a single container named caddy and a single caddy process

=head1 CONFIGURATION

For full docker support, ensure that munin can access docker. E.g. run munin with the docker group:

[cpu_by_group]
group docker

=head1 REQUIREMENTS

- bash v4.3+
- docker access rights (if using docker)

=head1 AUTHOR

Copyright (C) 2025 pimlie

=head1 LICENSE

MIT

=head1 MAGIC MARKERS

 #%# family=auto
 #%# capabilities=autoconf

=cut

if [ -n "$MUNIN_LIBDIR" ]; then
  . "$MUNIN_LIBDIR/plugins/plugin.sh"
fi

PLUGIN_BASE="$(basename "$0")"
PLUGIN_CACHE="$MUNIN_STATEFILE"

# Check if docker can be used
#
# Outputs: 1 when the docker CLI exists and `docker ps` succeeds, 0 otherwise
# Returns: 0 when docker is usable, 1 otherwise
function can_use_docker {
  if command -v docker >/dev/null; then
    if docker ps >/dev/null 2>&1; then
      echo 1
      return 0
    fi
  fi

  echo 0
  return 1
}

# is a bool, 1=true
# The non-zero status of can_use_docker must not trip `set -e`: an assignment
# takes the exit status of its command substitution, which would abort the
# plugin on every host without docker.
HAS_DOCKER="$(can_use_docker)" || true

# Left trim white spaces
#
# Arguments: the words to trim (joined with a space)
# Outputs:   the joined string without leading whitespace
function ltrim {
  local var="$*"
  # remove leading whitespace characters
  var="${var#"${var%%[![:space:]]*}"}"
  echo "$var"
}

# Test whether the contents of a /proc/<pid>/stat line describe a kernel thread
#
# Arguments: $1 - the contents of /proc/<pid>/stat
# Returns:   0 when the PF_KTHREAD bit is set in the flags field, 1 otherwise
function is_kthread_stat {
  local -r pf_kthread=0x00200000
  local -a fields
  local flags

  # The second field (comm) is wrapped in parentheses and may contain spaces,
  # so counting space separated fields from the start is unreliable. Only
  # split what follows the last ") ": state ppid pgrp session tty_nr tpgid flags
  IFS=' ' read -ra fields <<< "${1##*\) }"
  flags="${fields[6]:-0}"

  [[ "$flags" =~ ^[0-9]+$ ]] || return 1
  (( (flags & pf_kthread) == pf_kthread ))
}

#
# Create a munin variable safe & human readable cgroup name
#
# Supported human readable names:
# - 'User-$user_name' for isolated user processes
# - For docker containers
#   - Group by the com.docker.compose.project label (the docker project name) if applicable
#   - Add container name, either as group if not docker compose otherwise as graph prefix
#
# Arguments: $1 - name of the variable that receives the result
#            $2 - contents of /proc/<pid>/cgroup
#            $3 - pid of the process, used to detect kernel threads
#
declare -A docker_cache
function safe_cgroup_name {
  local -n local_cgroup_name=$1
  local cgroup="$2"
  local stat_line=""
  local user_id user
  local docker_id docker_data container_name compose_project
  local -a docker_names

  if [[ "$cgroup" != *.scope ]] || [[ "$cgroup" == */init.scope ]]; then
    # if no cgroup scope, just list as system
    cgroup="System"

    # Unless it's a kernel thread. The process may already be gone, in which
    # case reading the stat file simply fails and it stays listed as system
    if [ -r "/proc/$3/stat" ] \
      && IFS= read -r stat_line 2>/dev/null < "/proc/$3/stat" \
      && is_kthread_stat "$stat_line"; then
      cgroup="KThread"
    fi
  elif [[ "$cgroup" == *"/user.slice/user-"* ]]; then
    # Extract user id and use user name as cgroup name
    user_id="${cgroup#*/user.slice/user-}"
    user_id="${user_id%%.*}"
    # Fall back to the numeric id when it cannot be resolved to a name
    user="$(id -nu "$user_id" 2>/dev/null)" || user="$user_id"
    cgroup="User-${user}"
  elif [[ "$HAS_DOCKER" == 1 && "$cgroup" == *"/docker-"* ]]; then
    # Extract docker container id and use either compose project otherwise
    # container name as cgroup name
    docker_id="${cgroup#*/docker-}"
    docker_id="${docker_id%%.*}"

    docker_data=""
    if [ -n "$docker_id" ]; then
      docker_data="${docker_cache["$docker_id"]}"
      if [ -z "$docker_data" ]; then
        # The container may have been removed in the meantime
        docker_data="$(docker inspect --format='{{ .Name }} {{ index .Config.Labels "com.docker.compose.project" }}' "$docker_id" 2>/dev/null)" || docker_data=""
        docker_cache["$docker_id"]="$docker_data"
      fi
    fi

    IFS=' ' read -ra docker_names <<< "$docker_data"
    # .Name is reported with a leading slash
    container_name="${docker_names[0]##*/}"
    compose_project="${docker_names[1]}"

    if [ -n "$compose_project" ]; then
      cgroup="$compose_project.$container_name"
    elif [ -n "$container_name" ]; then
      cgroup="$container_name"
    else
      # Nothing known about this container, use the cgroup leaf name
      cgroup="${cgroup##*/}"
      cgroup="${cgroup%%.*}"
    fi
  else
    cgroup="${cgroup##*/}"
    cgroup="${cgroup%%.*}"
  fi

  # shellcheck disable=SC2034
  local_cgroup_name="${cgroup//_/-}"
}

# Format process name as munin safe variable
#
# Arguments: $1 - name of the variable that receives the result
#            $2 - process name
function safe_proc_name {
  local -n local_name=$1
  local name="$2"
  name="${name%%/*}"             # Remove everything after a /
  name="${name%.}"               # Remove trailing dot
  name="${name//[^a-zA-Z0-9]/_}" # Make Munin var safe

  # shellcheck disable=SC2034
  local_name="$name"
}

# Format graph name as munin safe variable
#
# Arguments: $1 - graph name, optional
# Outputs:   the safe graph name
function safe_graph_name {
  local name="${1:-cputime}" # Use cputime as default value if unset
  name="${name//|/.}"        # Replace any pipes to dots
  echo "${name//./_}"        # Replace dots with underscores
}

# Calculate process time in seconds
#
# Arguments: $1 - name of the variable that receives the number of seconds
#            $2 - cpu time in ([days]-)[hour]:[min]:[sec] format
# Exits with status 1 when $2 is not in the expected format
function calc_proc_time {
  local -n local_proc_time=$1
  local -r time_re='^(([0-9]+)-)?([0-9]+):([0-9]+):([0-9]+)$'

  if ! [[ "$2" =~ $time_re ]]; then
    echo "Expected time to be in ([days]-)[hour]:[min]:[sec] format, got '$2'" >&2
    exit 1
  fi

  local day="${BASH_REMATCH[2]:-0}"
  local hour="${BASH_REMATCH[3]}"
  local min="${BASH_REMATCH[4]}"
  local sec="${BASH_REMATCH[5]}"

  # Force base 10, zero padded values like 08 would otherwise be read as octal
  # shellcheck disable=SC2034
  local_proc_time="$(( (10#$day * 24 + 10#$hour) * 3600 + 10#$min * 60 + 10#$sec ))"
}

# Get array of all previous & current running processes
#
# Arguments: $1 - name of the associative array to fill, keys are
#                 '<group>|<graph name>' and values the summed cpu seconds
function get_processes {
  local -n procs=$1
  local process

  if [ -n "$PLUGIN_CACHE" ] && [ -r "$PLUGIN_CACHE" ] && [ -s "$PLUGIN_CACHE" ]; then
    while read -r process || [ -n "$process" ]; do
      # An empty key is not a valid array subscript
      [ -n "$process" ] || continue
      procs["$process"]=0
    done < "$PLUGIN_CACHE"
  fi

  local cpu_time proc_id proc_comm _rest
  local cgroup_file cgroup_name proc_name proc_time
  local breadcrumb multi_name multi_graph_name
  local procs_key cur_value

  # Only the first word of the command name is used, the remainder is
  # collected in _rest
  while read -r cpu_time proc_id proc_comm _rest; do
    [ -n "$proc_id" ] || continue

    # Create dot separated cgroup / process name breadcrumb
    cgroup_file="/proc/$proc_id/cgroup"
    # cat cgroup directly from /proc fs, as ps is bad for parsing multiple variable
    # width columns and trimming whitespace is quite slow in bash
    cgroup_name=""
    if [ -r "$cgroup_file" ]; then
      # The process may exit between the check and the read
      cgroup_name="$(cat "$cgroup_file" 2>/dev/null)" || cgroup_name=""
    fi
    proc_name="$proc_comm"
    proc_time=0

    safe_cgroup_name cgroup_name "$cgroup_name" "$proc_id"
    safe_proc_name proc_name "$proc_name"

    # Use everything before the first dot as the cgroup name, and everything
    # after the first dot as process name.
    # This ensures that we can also use prefixes within cgroup's, fe
    # for docker containers we want the compose projects as cgroup name
    # but still prefix individual processes with their container name
    breadcrumb="${cgroup_name}.${proc_name}"
    multi_name="${breadcrumb%%.*}"
    multi_graph_name="${breadcrumb#*.}"

    calc_proc_time proc_time "$cpu_time"

    if [ "$proc_time" -gt 0 ]; then
      procs_key="${multi_name}|${multi_graph_name}"
      cur_value="${procs["$procs_key"]}"

      procs["$procs_key"]="$(( cur_value + proc_time ))"
    fi
  done < <(ps -eo time,pid,comm h)
}

# Store the given process names in the plugin state file, one per line
#
# Arguments: $1 - name of the array holding the process names
function cache_processes {
  local -n proc_names=$1

  if [ -n "$PLUGIN_CACHE" ] && [ -w "$(dirname "$PLUGIN_CACHE")" ]; then
    if [ "${#proc_names[@]}" -gt 0 ]; then
      printf '%s\n' "${proc_names[@]}" > "$PLUGIN_CACHE"
    else
      : > "$PLUGIN_CACHE"
    fi
  fi
}

# Emit multigraph header
#
# Arguments: $1 - sub graph name, optional
function emit_multigraph_base {
  if [ -z "$1" ]; then
    echo "multigraph $PLUGIN_BASE"
  else
    echo "multigraph ${PLUGIN_BASE}_${1//-/_}"
  fi
}

# Emit base graph config
# NOTE(review): the definition of emit_graph_base is cut off in the reviewed
# source right after its here-document operator. The visible fragment is kept
# verbatim below instead of guessing the remainder; restore it together with
# the rest of the plugin.
# function emit_graph_base {
#   cat <