feat: add cpu_by_group plugin (#1468)

* feat: add cpu_by_group plugin The current cpu_by_process plugin says it self it can generate huge graphs which are difficult to decipher. This plugin tries to mitigate that issue by automatically grouping processes by their cgroup. This also solves the issue that cpu_by_process groups processes by process name, which means atm you cant really differentiate between f.e. python processes in various cgroups. The remaining processes that are not running in a cgroup are also split between kernel threads and system processes For user & docker cgroups a human readable cgroup label is constructed (if munin has f.e. docker access). The function in this plugin could probably be expanded for other often used cgroups * chore: add example graphs & update docs * chore: update docs
2025-07-21 02:33:18 +00:00 · 2025-01-07 09:11:48 +01:00 · 2025-01-07 09:11:48 +01:00 · 67024140dd
commit 67024140dd
parent 50c90dec07
4 changed files with 360 additions and 0 deletions
--- a/plugins/cpu/cpu_by_group
+++ b/plugins/cpu/cpu_by_group
@ -0,0 +1,360 @@
+#!/bin/bash
+# -*- sh -*-
+
+set -e
+
+: << =cut
+
+=head1 NAME
+
+cpu_by_group - Monitors CPU time for all processes on a system and groups them into graphs by cgroup, with separate graphs for system processes and kernel threads
+
+=head1 DESCRIPTION
+
+Works similarly to good ol' cpu_by_process, but groups processes by their cgroup, or as system processes or kernel threads when they have none, to produce smaller graphs.
+
+For docker projects / containers the processes are grouped by either their docker compose project name or container name.
+For user cgroups the user id will be resolved to the corresponding user name.
+
+The plugin keeps track of previously running processes in MUNIN_STATEFILE
+
+=head2 EXAMPLE GRAPHS
+
+There are 3 example graphs which were all automatically generated by this plugin.
+
+- System-day -> system processes (processes without a cgroup)
+- KThread-day -> kernel threads
+- caddy-day -> A docker compose project named caddy, with a single container named caddy and a single caddy process 
+
+=head1 CONFIGURATION
+
+For full docker support, ensure that munin can access docker. For example, run the plugin with the docker group:
+
+[cpu_by_group]
+group docker
+
+=head1 REQUIREMENTS
+
+- bash v4.3+
+- docker access rights (if using docker)
+
+=head1 AUTHOR
+
+Copyright (C) 2025 pimlie
+
+=head1 LICENSE
+
+MIT
+
+=head1 MAGIC MARKERS
+
+ #%# family=auto
+ #%# capabilities=autoconf
+
+=cut
+
+# Source plugin.sh from MUNIN_LIBDIR, but only when that variable is set
+if [[ -n "$MUNIN_LIBDIR" ]]; then
+	source "$MUNIN_LIBDIR/plugins/plugin.sh"
+fi
+
+# Name the plugin was invoked as; used as prefix of every multigraph section
+PLUGIN_BASE="$(basename "$0")"
+# File in which the process keys of earlier runs are remembered
+PLUGIN_CACHE="$MUNIN_STATEFILE"
+
+# Check if docker can be used
+#
+# Outputs: 1 on stdout when the docker binary exists and `docker ps` succeeds,
+#          0 otherwise
+# Returns: 0 when docker is usable, 1 when it is not
+function can_use_docker {
+	if command -v docker >/dev/null && docker ps >/dev/null 2>&1; then
+		echo 1
+		return 0
+	fi
+
+	echo 0
+	return 1
+}
+
+# The function reports "no docker" through its output AND a non-zero status.
+# Under `set -e` a plain assignment from a failing command substitution aborts
+# the script, so the status is neutralised here and only the printed flag is
+# kept.
+HAS_DOCKER="$(can_use_docker)" || true # is a bool, 1=true
+
+# Left trim white spaces
+#
+# Arguments: all arguments, taken together as one string ("$*")
+# Outputs:   that string without its leading whitespace
+function ltrim {
+    local text="$*"
+    # longest prefix that ends right before the first non-whitespace character
+    local padding="${text%%[![:space:]]*}"
+    echo "${text#"$padding"}"
+}
+
+#
+# Create a munin variable safe & human readable cgroup name
+#
+# Supported human readable names:
+# - 'System' for processes without a cgroup scope, 'KThread' for kernel threads
+# - 'User-$user_name' for isolated user processes
+# - For docker containers
+#   - Group by the com.docker.compose.project label (the docker project name) if applicable
+#   - Add container name, either as group if not docker compose otherwise as graph prefix
+# - Otherwise the last path element of the cgroup, cut at its first dot
+#
+# Arguments:
+#   $1 - name of the variable that receives the result (nameref)
+#   $2 - contents of the cgroup file of the process
+#   $3 - pid of the process, used to detect kernel threads
+# Globals:
+#   HAS_DOCKER (read), docker_cache (read & written)
+#
+declare -A docker_cache
+function safe_cgroup_name {
+	local -n local_cgroup_name=$1
+	local cgroup="$2"
+	local pid="$3"
+	local resolved=""
+
+	# shellcheck disable=SC2076
+	if [[ "$cgroup" != *.scope ]] || [[ "$cgroup" == */init.scope ]]; then
+		# if no cgroup scope, just list as system
+		resolved="System"
+
+		# Unless it's a kernel thread
+		local stat_line=""
+		if [ -n "$pid" ] && [ -r "/proc/$pid/stat" ]; then
+			# The process may exit between the check and the read; that must
+			# not abort the plugin under `set -e`
+			{ IFS= read -r stat_line < "/proc/$pid/stat"; } 2>/dev/null || true
+		fi
+
+		# The comm field (2nd) is wrapped in parentheses and may itself contain
+		# spaces, so only the part after the last ') ' is split. What remains
+		# starts at the state field (3rd), which puts flags (9th) at index 6.
+		local -a stat_fields=()
+		IFS=' ' read -ra stat_fields <<< "${stat_line##*) }"
+		local flags="${stat_fields[6]:-0}"
+		local pf_kthread=0x00200000 # PF_KTHREAD bit of the process flags
+
+		if [[ "$flags" =~ ^[0-9]+$ ]] && (( (flags & pf_kthread) == pf_kthread )); then
+			resolved="KThread"
+		fi
+	elif [[ "$cgroup" =~ "/user.slice/user-" ]]; then
+		# Extract user id and use user name as cgroup name
+		local user_id user_name
+		user_id="${cgroup#*/user.slice/user-}"
+		user_id="${user_id%%.*}"
+		# Fall back to the numeric id when it cannot be resolved, a failing
+		# lookup must not abort the plugin under `set -e`
+		user_name="$(id -nu "$user_id" 2>/dev/null)" || user_name="$user_id"
+		resolved="User-${user_name}"
+	elif [ "${HAS_DOCKER:-0}" = 1 ] && [[ "$cgroup" =~ "/docker-" ]]; then
+		# Extract docker container id and use either compose project otherwise
+		# container name as cgroup name
+		local docker_id docker_data=""
+		docker_id="${cgroup#*/docker-}"
+		docker_id="${docker_id%%.*}"
+
+		# An empty subscript is invalid for associative arrays
+		if [ -n "$docker_id" ]; then
+			docker_data="${docker_cache[$docker_id]:-}"
+			if [ -z "$docker_data" ]; then
+				docker_data="$(docker inspect --format='{{ .Name }} {{ index .Config.Labels "com.docker.compose.project" }}' "$docker_id")" || docker_data=""
+				# Remember failed lookups too ('/' results in no names below),
+				# so docker is queried once per container and not per process
+				docker_cache[$docker_id]="${docker_data:-/}"
+			fi
+		fi
+
+		local -a docker_names=()
+		IFS=' ' read -ra docker_names <<< "$docker_data"
+		local container_name="${docker_names[0]##*/}" # drop everything up to the last /
+		local compose_project="${docker_names[1]}"
+
+		if [ -n "$compose_project" ]; then
+			resolved="$compose_project.$container_name"
+		elif [ -n "$container_name" ]; then
+			resolved="$container_name"
+		fi
+	fi
+
+	if [ -z "$resolved" ]; then
+		# Any other cgroup, or a docker container that could not be inspected:
+		# use the last path element up to its first dot
+		resolved="${cgroup##*/}"
+		resolved="${resolved%%.*}"
+	fi
+
+	# shellcheck disable=SC2034
+	local_cgroup_name="${resolved//_/-}"
+}
+
+# Format process name as munin safe variable
+#
+# Arguments:
+#   $1 - name of the variable that receives the result (nameref)
+#   $2 - raw process name
+function safe_proc_name {
+	local -n local_name=$1
+	local cleaned="${2%%/*}" # keep only what precedes the first /
+
+	cleaned="${cleaned%.}" # a single trailing dot is dropped
+
+	# every remaining character outside a-z, A-Z and 0-9 becomes an underscore
+	# shellcheck disable=SC2034
+	local_name="${cleaned//[^a-zA-Z0-9]/_}"
+}
+
+# Format graph name as munin safe variable
+#
+# Arguments: $1 - raw graph name (optional, 'cputime' is used when unset or empty)
+# Outputs:   the name with every pipe and dot replaced by an underscore
+function safe_graph_name {
+	# Declared local so a caller's own 'name' variable is never overwritten
+	local name="${1:-cputime}" # Use cputime as default value if unset
+	name="${name//|/.}" # Replace any pipes to dots
+	printf '%s\n' "${name//./_}" # Replace dots with underscores
+}
+
+# Calculate process time in seconds
+#
+# Arguments:
+#   $1 - name of the variable that receives the number of seconds (nameref)
+#   $2 - cumulative CPU time in ([days]-)[hour]:[min]:[sec] format
+# Exits the script with status 1 when $2 does not have that format
+function calc_proc_time {
+	local -n local_proc_time=$1
+	local cpt_input="$2"
+	local cpt_format='^(([0-9]+)-)?([0-9]+):([0-9]+):([0-9]+)$'
+
+	if [[ ! "$cpt_input" =~ $cpt_format ]]; then
+		# Report the offending value ($2), not the name of the output variable
+		echo "Expected time to be in ([days]-)[hour]:[min]:[sec] format, got '$cpt_input'" >&2
+		exit 1
+	fi
+
+	local cpt_days="${BASH_REMATCH[2]:-0}"
+	local cpt_hours="${BASH_REMATCH[3]}"
+	local cpt_minutes="${BASH_REMATCH[4]}"
+	local cpt_seconds="${BASH_REMATCH[5]}"
+
+	# Force base 10 on every field, zero padded values such as '08' would
+	# otherwise be rejected as invalid octal numbers
+	# shellcheck disable=SC2034
+	local_proc_time="$(( (10#$cpt_days * 24 + 10#$cpt_hours) * 3600 + 10#$cpt_minutes * 60 + 10#$cpt_seconds ))"
+}
+
+# Get array of all previous & current running processes
+#
+# Arguments:
+#   $1 - name of an associative array (nameref); it is filled with
+#        '<cgroup>|<graph name>' keys and the summed CPU seconds as values
+# Globals:
+#   PLUGIN_CACHE (read) - every key listed in this file is added with value 0
+function get_processes {
+	local -n procs=$1
+	local cached_key
+
+	if [ -n "$PLUGIN_CACHE" ] && [ -r "$PLUGIN_CACHE" ] && [ -s "$PLUGIN_CACHE" ]; then
+		while read -r cached_key; do
+			# An empty subscript is invalid for associative arrays
+			[ -n "$cached_key" ] || continue
+			procs["$cached_key"]=0
+		done < "$PLUGIN_CACHE"
+	fi
+
+	local ps_time ps_pid ps_comm
+	local cgroup_name proc_name proc_time
+	local breadcrumb multi_name multi_graph_name procs_key
+
+	# The ps listing is read line by line on its own file descriptor: the
+	# global IFS stays untouched, no globbing is applied to the output and
+	# commands run inside the loop cannot consume the listing. The last
+	# variable receives the rest of the line, so command names that contain
+	# spaces are kept whole.
+	while read -r -u 3 ps_time ps_pid ps_comm; do
+		[ -n "$ps_pid" ] || continue
+
+		# ps appends this marker to the command name of zombie processes
+		ps_comm="${ps_comm% <defunct>}"
+
+		# cat cgroup directly from /proc fs, as ps is bad for parsing multiple
+		# variable width columns
+		cgroup_name=""
+		if [ -r "/proc/$ps_pid/cgroup" ]; then
+			# The process may have exited in the meantime; treat that as
+			# "no cgroup" instead of aborting under `set -e`
+			cgroup_name="$(cat "/proc/$ps_pid/cgroup" 2>/dev/null)" || cgroup_name=""
+		fi
+		proc_name="$ps_comm"
+		proc_time=0
+
+		safe_cgroup_name cgroup_name "$cgroup_name" "$ps_pid"
+		safe_proc_name proc_name "$proc_name"
+
+		# Use everything before the first dot as the cgroup name, and everything
+		# after the first dot as process name.
+		# This ensures that we can also use prefixes within cgroup's, fe
+		# for docker containers we want the compose projects as cgroup name
+		# but still prefix individual processes with their container name
+		breadcrumb="${cgroup_name}.${proc_name}"
+		multi_name="${breadcrumb%%.*}"
+		multi_graph_name="${breadcrumb#*.}"
+
+		calc_proc_time proc_time "$ps_time"
+
+		if [ "$proc_time" -gt 0 ]; then
+			procs_key="${multi_name}|${multi_graph_name}"
+			procs["$procs_key"]="$(( ${procs["$procs_key"]:-0} + proc_time ))"
+		fi
+	done 3< <(ps -eo time,pid,comm h)
+}
+
+# Store the given process keys, one per line, for the next run
+#
+# Arguments:
+#   $1 - name of the array (nameref) that holds the keys
+# Globals:
+#   PLUGIN_CACHE (read) - target file; nothing is written when it is empty
+#                         or when its directory is not writable
+function cache_processes {
+	local -n proc_names=$1
+
+	if [ -n "$PLUGIN_CACHE" ] && [ -w "$(dirname "$PLUGIN_CACHE")" ]; then
+		if [ "${#proc_names[@]}" -gt 0 ]; then
+			# One key per line; unlike echo | tr this keeps keys with spaces whole
+			printf '%s\n' "${proc_names[@]}" > "$PLUGIN_CACHE"
+		else
+			# Nothing to remember: leave an empty file rather than a blank line
+			: > "$PLUGIN_CACHE"
+		fi
+	fi
+}
+
+# Emit multigraph header
+#
+# Arguments: $1 - cgroup name (optional); dashes are turned into underscores
+# Globals:   PLUGIN_BASE (read)
+# Outputs:   'multigraph <base>' without argument, else 'multigraph <base>_<name>'
+function emit_multigraph_base {
+	local suffix="${1//-/_}"
+
+	# the separating underscore is only added when there is a suffix
+	echo "multigraph ${PLUGIN_BASE}${suffix:+_${suffix}}"
+}
+
+# Emit base graph config
+#
+# Arguments: $1 - cgroup name; dashes are shown as spaces in the graph title
+function emit_graph_base {
+	local title_group="${1//-/ }"
+
+	printf '%s\n' \
+		"graph_title CPU time for ${title_group} processes" \
+		"graph_args --base 1000" \
+		"graph_vlabel seconds" \
+		"graph_category processes" \
+		"graph_info Shows CPU time used by each process name"
+}
+
+# Emit graph config, default = STACK
+#
+# Arguments:
+#   $1 - graph name; also used as label ('CPU time' when empty)
+#   $2 - draw type (optional, STACK when empty)
+function emit_graph_config {
+	local field
+	local caption="${1:-CPU time}"
+	local draw="${2:-STACK}"
+	field="$(safe_graph_name "$1")"
+
+	printf '%s\n' \
+		"$field.label $caption" \
+		"$field.min 0" \
+		"$field.type DERIVE" \
+		"$field.draw $draw"
+}
+
+# Emit value for process
+#
+# Arguments:
+#   $1 - graph name, passed through safe_graph_name to form the field name
+#   $2 - value to report
+function emit_value {
+	local field
+	field="$(safe_graph_name "$1")"
+
+	printf '%s.value %s\n' "$field" "$2"
+}
+
+# Check if string equals config
+#
+# Arguments: $1 - string to check
+# Returns:   0 when $1 is exactly 'config', 1 otherwise
+function is_config {
+	[[ "$1" == "config" ]]
+}
+
+# Check if new cgroup is different from current and if so update
+#
+# Arguments:
+#   $1 - name of the variable holding the current cgroup (nameref, updated)
+#   $2 - cgroup to compare against
+# Returns: 0 when the current cgroup was empty or differed (and was updated),
+#          1 when both are the same
+function is_new_cgroup {
+	local -n cur_cgroup=$1
+	local candidate=$2
+
+	# unchanged: a current cgroup is known and the candidate equals it
+	if [[ -n "$cur_cgroup" && "$candidate" == "$cur_cgroup" ]]; then
+		return 1
+	fi
+
+	cur_cgroup=$candidate
+	return 0
+}
+
+# Entry point: 'autoconf' answers yes; any other argument collects the
+# processes and prints either the graph config ('config') or the values
+case "${1:-}" in
+    autoconf)
+		echo "yes"
+        ;;
+    *)
+		declare -A processes
+		get_processes processes
+
+		# Sort byte-wise on the cgroup part first, so all keys of one cgroup are
+		# adjacent regardless of locale or letter case (a multigraph section is
+		# opened below whenever the cgroup changes), then case-insensitively on
+		# the graph name.
+		process_names=()
+		if [ "${#processes[@]}" -gt 0 ]; then
+			mapfile -t process_names < <(printf '%s\n' "${!processes[@]}" | LC_ALL=C sort -t '|' -k1,1 -k2f)
+		fi
+
+		cgroup=""
+		draw_type="STACK"
+		for process_name in "${process_names[@]}"; do
+			# key layout is '<cgroup>|<graph name>'
+			IFS="|" read -r group_key field_key <<< "$process_name"
+
+			if is_new_cgroup cgroup "$group_key"; then
+				emit_multigraph_base "$cgroup"
+
+				# the first field of a graph is drawn as AREA, the others are
+				# stacked on top of it
+				draw_type="AREA"
+				if is_config "${1:-}"; then
+					emit_graph_base "$cgroup"
+				fi
+			else
+				draw_type="STACK"
+			fi
+
+			if is_config "${1:-}"; then
+				emit_graph_config "$field_key" "$draw_type"
+			else
+				emit_value "$field_key" "${processes[$process_name]}"
+			fi
+		done
+
+		cache_processes process_names
+		;;
+esac
--- a/plugins/cpu/example-graphs/cpu_by_group_KThread-day.png
+++ b/plugins/cpu/example-graphs/cpu_by_group_KThread-day.png
--- a/plugins/cpu/example-graphs/cpu_by_group_System-day.png
+++ b/plugins/cpu/example-graphs/cpu_by_group_System-day.png
--- a/plugins/cpu/example-graphs/cpu_by_group_caddy-day.png
+++ b/plugins/cpu/example-graphs/cpu_by_group_caddy-day.png