mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-21 18:41:03 +00:00
360 lines
8.5 KiB
Bash
Executable file
360 lines
8.5 KiB
Bash
Executable file
#!/bin/bash
# -*- sh -*-

# Abort the plugin as soon as an unguarded command fails
set -o errexit
|
|
|
|
: << =cut
|
|
|
|
=head1 NAME
|
|
|
|
cpu_by_group - Monitors cpu time for all processes on a system and groups them into graphs by either cgroup, system processes or kernel threads
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
Works similarly to good ol' cpu_by_process, but groups processes by their cgroup, as system processes or as kernel threads to produce smaller graphs.
|
|
|
|
For docker projects / containers the processes are grouped by either their docker compose project name or container name.
|
|
For user cgroups the user id will be resolved to the corresponding user name.
|
|
|
|
The plugin keeps track of previously running processes in MUNIN_STATEFILE
|
|
|
|
=head2 EXAMPLE GRAPHS
|
|
|
|
There are 3 example graphs which were all automatically generated by this plugin.
|
|
|
|
- System-day -> system processes (processes without a cgroup)
|
|
- KThread-day -> kernel threads
|
|
- caddy-day -> A docker compose project named caddy, with a single container named caddy and a single caddy process
|
|
|
|
=head1 CONFIGURATION
|
|
|
|
For full docker support, ensure that munin can access docker, e.g. by running the plugin with the docker group:
|
|
|
|
[cpu_by_group]
|
|
group docker
|
|
|
|
=head1 REQUIREMENTS
|
|
|
|
- bash v4.3+
|
|
- docker access rights (if using docker)
|
|
|
|
=head1 AUTHOR
|
|
|
|
Copyright (C) 2025 pimlie
|
|
|
|
=head1 LICENSE
|
|
|
|
MIT
|
|
|
|
=head1 MAGIC MARKERS
|
|
|
|
#%# family=auto
|
|
#%# capabilities=autoconf
|
|
|
|
=cut
|
|
|
|
# Load munin's shared shell helpers when the location of its library
# directory is known
if [[ -n "$MUNIN_LIBDIR" ]]; then
  # shellcheck source=/dev/null
  source "$MUNIN_LIBDIR/plugins/plugin.sh"
fi

# Name this plugin was invoked as; prefixes every multigraph name
PLUGIN_BASE="$(basename "$0")"
# File (taken from MUNIN_STATEFILE) listing the process names seen so far
PLUGIN_CACHE="$MUNIN_STATEFILE"
|
|
|
|
# Check if docker can be used
#
# Outputs: "1" on stdout when the docker CLI exists and `docker ps` succeeds,
#          "0" otherwise
# Returns: 0 when docker is usable, 1 when it is not
function can_use_docker {
  if command -v docker >/dev/null 2>&1 && docker ps >/dev/null 2>&1; then
    echo 1
    return 0
  fi

  echo 0
  return 1
}

# is a bool, 1=true
# The `|| true` is required: this script runs under `set -e` and a plain
# assignment takes the exit status of its command substitution, so without
# it the plugin would abort on every host where docker is not usable
HAS_DOCKER="$(can_use_docker)" || true
|
|
|
|
# Strip leading whitespace
#
# Arguments: the text to trim (several arguments are joined like "$*")
# Outputs:   the text without its leading whitespace, on stdout
function ltrim {
  local text="$*"
  # Longest prefix that consists of whitespace characters only
  local padding="${text%%[![:space:]]*}"

  echo "${text#"$padding"}"
}
|
|
|
|
#
# Create a munin variable safe & human readable cgroup name
#
# Resulting names:
#   - 'System' for processes that are not in a cgroup scope (or in init.scope)
#   - 'KThread' for such processes when they are kernel threads
#   - 'User-$user_name' for isolated user processes (the numeric user id is
#     used when it cannot be resolved to a name)
#   - For docker containers (only when HAS_DOCKER is 1)
#     - '$compose_project.$container_name' when the container carries the
#       com.docker.compose.project label
#     - '$container_name' otherwise
#   - The scope's base name (without extension) for every other scope
# Underscores in the result are replaced by dashes.
#
# Arguments:
#   $1 - name of the caller's variable that receives the result
#   $2 - contents of the process' /proc/<pid>/cgroup file
#   $3 - process id, used to recognise kernel threads
# Globals:
#   HAS_DOCKER (read), docker_cache (read & written)
#
declare -A docker_cache
function safe_cgroup_name {
  local -n local_cgroup_name=$1
  local cgroup="$2"

  # Base name of the scope without extension; the generic fallback name
  local scope_name="${cgroup##*/}"
  scope_name="${scope_name%%.*}"

  # shellcheck disable=SC2076
  if [[ "$cgroup" != *.scope ]] || [[ "$cgroup" == */init.scope ]]; then
    # if no cgroup scope, just list as system
    cgroup="System"

    # Unless it's a kernel thread
    if [ -r "/proc/$3/stat" ]; then
      local stat_line=""
      local -a stat_fields=()
      local pf_kthread=0x00200000

      # The process may exit between the check above and this read; treat
      # that as "not a kernel thread" instead of aborting under `set -e`
      { IFS= read -r stat_line < "/proc/$3/stat"; } 2>/dev/null || stat_line=""

      # The command name (2nd field) is wrapped in parentheses and may
      # contain spaces itself, so fields are counted from the last ')':
      # state ppid pgrp session tty_nr tpgid flags
      IFS=' ' read -ra stat_fields <<< "${stat_line##*) }"

      local flags="${stat_fields[6]:-0}"
      if [[ "$flags" =~ ^[0-9]+$ ]] && (( (flags & pf_kthread) == pf_kthread )); then
        cgroup="KThread"
      fi
    fi
  elif [[ "$cgroup" =~ "/user.slice/user-" ]]; then
    # Extract user id and use user name as cgroup name
    local user_id="${cgroup#*/user.slice/user-}"
    user_id="${user_id%%.*}"

    # Fall back to the numeric id when it cannot be resolved to a name
    local user=""
    user="$(id -nu "$user_id" 2>/dev/null)" || user=""
    cgroup="User-${user:-$user_id}"
  elif [ "${HAS_DOCKER:-0}" -eq 1 ] && [[ "$cgroup" =~ "/docker-" ]]; then
    # Extract docker container id and use either compose project otherwise
    # container name as cgroup name
    local docker_id="${cgroup#*/docker-}"
    docker_id="${docker_id%%.*}"

    local docker_data=""
    if [ -n "$docker_id" ]; then
      # Failed lookups are cached as well (as an empty string) so docker is
      # asked only once per container
      if [ -z "${docker_cache["$docker_id"]+cached}" ]; then
        docker_cache["$docker_id"]="$(docker inspect --format='{{ .Name }} {{ index .Config.Labels "com.docker.compose.project" }}' "$docker_id" 2>/dev/null)" \
          || docker_cache["$docker_id"]=""
      fi
      docker_data="${docker_cache["$docker_id"]}"
    fi

    local -a docker_names=()
    IFS=' ' read -ra docker_names <<< "$docker_data"
    # Keep only what follows the last slash of the reported container name
    local container_name="${docker_names[0]##*/}"
    local compose_project="${docker_names[1]}"

    if [ -n "$compose_project" ]; then
      cgroup="$compose_project.$container_name"
    elif [ -n "$container_name" ]; then
      cgroup="$container_name"
    else
      # docker returned nothing usable (e.g. the container is gone already);
      # never leak the raw cgroup path into a graph name
      cgroup="$scope_name"
    fi
  else
    cgroup="$scope_name"
  fi

  # shellcheck disable=SC2034
  local_cgroup_name="${cgroup//_/-}"
}
|
|
|
|
# Turn a process name into a munin safe identifier
#
# Arguments:
#   $1 - name of the caller's variable that receives the result
#   $2 - raw process name
function safe_proc_name {
  local -n local_name=$1
  local cleaned="${2%%/*}" # Keep only what precedes the first /

  cleaned="${cleaned%.}" # Drop a single trailing dot

  # Every character that is not alphanumeric becomes an underscore
  # shellcheck disable=SC2034
  local_name="${cleaned//[^a-zA-Z0-9]/_}"
}
|
|
|
|
# Format graph name as munin safe variable
#
# Munin field names may only consist of [a-zA-Z0-9_] and must not start with
# a digit. Pipes and dots (the separators used in process keys) and every
# other unsupported character, such as the dashes found in container names,
# are replaced by underscores; an unsupported first character is replaced by
# an underscore as well.
#
# Arguments: $1 - graph name (defaults to "cputime" when empty)
# Outputs:   the sanitised name on stdout
function safe_graph_name {
  # local, so calling this function directly never clobbers a caller's $name
  local name="${1:-cputime}" # Use cputime as default value if unset

  name="${name//[^a-zA-Z0-9_]/_}"
  if [[ "$name" == [^a-zA-Z_]* ]]; then
    name="_${name:1}"
  fi

  echo "$name"
}
|
|
|
|
# Calculate process time in seconds
#
# Arguments:
#   $1 - name of the caller's variable that receives the number of seconds
#   $2 - cpu time in ([days]-)[hour]:[min]:[sec] format
# Exits the plugin with status 1 when $2 does not have three ':' separated
# parts.
function calc_proc_time {
  local -n local_proc_time=$1
  local -a hms=()
  local -a dayhour=()

  IFS=":" read -ra hms <<< "$2"

  if [ "${#hms[@]}" -ne 3 ]; then
    # Report the offending value ($2), not the name of the output variable
    echo "Expected time to be in ([days]-)[hour]:[min]:[sec] format, got '$2'" >&2
    exit 1
  fi

  local day=0
  local hour="${hms[0]}"

  # The first part optionally carries a "<days>-" prefix
  IFS="-" read -ra dayhour <<< "${hms[0]}"
  if [ "${#dayhour[@]}" -gt 1 ]; then
    day="${dayhour[0]}"
    hour="${dayhour[1]}"
  fi

  # Force base 10 for every part: zero padded values such as "08" would
  # otherwise be rejected as invalid octal numbers
  # shellcheck disable=SC2034
  local_proc_time="$(( (10#$day * 24 + 10#$hour) * 3600 + 10#${hms[1]} * 60 + 10#${hms[2]} ))"
}
|
|
|
|
# Get array of all previous & current running processes
#
# Arguments:
#   $1 - name of the caller's associative array. Every key read from
#        PLUGIN_CACHE is added with value 0; every running process with a
#        non-zero cpu time adds its seconds to the entry
#        "<cgroup name>|<graph name>".
# Globals:
#   PLUGIN_CACHE (read)
function get_processes {
  local -n procs=$1
  local cached_key

  if [ -n "$PLUGIN_CACHE" ] && [ -r "$PLUGIN_CACHE" ] && [ -s "$PLUGIN_CACHE" ]; then
    while read -r cached_key || [ -n "$cached_key" ]; do
      # Skip blank lines: an empty associative array subscript is an error
      if [ -n "$cached_key" ]; then
        procs["$cached_key"]=0
      fi
    done < "$PLUGIN_CACHE"
  fi

  local cpu_time proc_id proc_name proc_time
  local cgroup_file cgroup_name
  local breadcrumb multi_name multi_graph_name procs_key

  # Read the ps output line by line rather than word splitting it: the
  # global IFS stays untouched and process names are never glob expanded.
  # `read` strips the column padding; the trailing `_` takes the remaining
  # words of a command name containing spaces, so only its first word is
  # used. File descriptor 3 keeps the loop body from consuming the list.
  while IFS=$' \t' read -r -u 3 cpu_time proc_id proc_name _; do
    [ -n "$proc_id" ] || continue

    # Create dot separated cgroup / process name breadcrumb
    cgroup_file="/proc/$proc_id/cgroup"
    # cat cgroup directly from /proc fs, as ps is bad for parsing multiple
    # variable width columns and trimming whitespace is quite slow in bash
    cgroup_name=""
    if [ -r "$cgroup_file" ]; then
      # The process may be gone by now; that must not abort the plugin
      cgroup_name="$(cat "$cgroup_file" 2>/dev/null)" || cgroup_name=""
    fi
    proc_time=0

    safe_cgroup_name cgroup_name "$cgroup_name" "$proc_id"
    safe_proc_name proc_name "$proc_name"

    # Use everything before the first dot as the cgroup name, and everything
    # after the first dot as process name.
    # This ensures that we can also use prefixes within cgroup's, fe
    # for docker containers we want the compose projects as cgroup name
    # but still prefix individual processes with their container name
    breadcrumb="${cgroup_name}.${proc_name}"
    multi_name="${breadcrumb%%.*}"
    multi_graph_name="${breadcrumb#*.}"

    calc_proc_time proc_time "$cpu_time"

    if [ "$proc_time" -gt 0 ]; then
      procs_key="${multi_name}|${multi_graph_name}"
      procs["$procs_key"]="$(( ${procs["$procs_key"]:-0} + proc_time ))"
    fi
  done 3< <(ps -eo time,pid,comm h)
}
|
|
|
|
# Store the given process keys, one per line, in PLUGIN_CACHE
#
# Nothing is written when PLUGIN_CACHE is empty or its directory is not
# writable.
#
# Arguments: $1 - name of the caller's array holding the keys
# Globals:   PLUGIN_CACHE (read)
function cache_processes {
  local -n proc_names=$1

  if [ -n "$PLUGIN_CACHE" ] && [ -w "$(dirname "$PLUGIN_CACHE")" ]; then
    if [ "${#proc_names[@]}" -gt 0 ]; then
      # One key per line, without splitting keys on spaces
      printf '%s\n' "${proc_names[@]}" > "$PLUGIN_CACHE"
    else
      # No keys: leave an empty file rather than a file holding a blank line
      : > "$PLUGIN_CACHE"
    fi
  fi
}
|
|
|
|
# Print the multigraph header line
#
# Arguments: $1 - optional cgroup name; its dashes become underscores and the
#                 result is appended to the plugin name
# Globals:   PLUGIN_BASE (read)
function emit_multigraph_base {
  local suffix="${1//-/_}"

  # The "_<suffix>" part is only added for a non-empty cgroup name
  printf '%s\n' "multigraph ${PLUGIN_BASE}${1:+_${suffix}}"
}
|
|
|
|
# Print the graph level configuration shared by every cgroup graph
#
# Arguments: $1 - cgroup name; dashes are shown as spaces in the title
function emit_graph_base {
  local title="${1//-/ }"

  printf '%s\n' \
    "graph_title CPU time for ${title} processes" \
    "graph_args --base 1000" \
    "graph_vlabel seconds" \
    "graph_category processes" \
    "graph_info Shows CPU time used by each process name"
}
|
|
|
|
# Print the field configuration for one process
#
# Arguments:
#   $1 - graph name; used as label ("CPU time" when empty) and, passed
#        through safe_graph_name, as field name
#   $2 - draw type, STACK when omitted
function emit_graph_config {
  local field
  field="$(safe_graph_name "$1")"

  printf '%s\n' \
    "${field}.label ${1:-CPU time}" \
    "${field}.min 0" \
    "${field}.type DERIVE" \
    "${field}.draw ${2:-STACK}"
}
|
|
|
|
# Print the value line for one process
#
# Arguments:
#   $1 - graph name, turned into the field name by safe_graph_name
#   $2 - value to report
function emit_value {
  local field
  field="$(safe_graph_name "$1")"

  printf '%s\n' "${field}.value $2"
}
|
|
|
|
# Tell whether the given argument is the literal string "config"
#
# Arguments: $1 - string to check
# Returns:   0 when $1 equals "config", 1 otherwise
function is_config {
  [[ "$1" == "config" ]]
}
|
|
|
|
# Track the cgroup currently being emitted
#
# Arguments:
#   $1 - name of the caller's variable holding the current cgroup; it is
#        updated when the cgroup changed
#   $2 - cgroup of the entry being processed
# Returns: 0 when the current cgroup was empty or differs from $2 (the
#          variable then holds $2), 1 when it is unchanged
function is_new_cgroup {
  local -n cur_cgroup=$1
  local candidate=$2

  # Same, non-empty cgroup as before: nothing to update
  if [[ -n "$cur_cgroup" && "$candidate" == "$cur_cgroup" ]]; then
    return 1
  fi

  cur_cgroup=$candidate
  return 0
}
|
|
|
|
# Entry point
#   autoconf       - report that the plugin can be used
#   config         - print one multigraph per cgroup with its field config
#   anything else  - print one multigraph per cgroup with the field values
case "$1" in
  autoconf)
    echo "yes"
    ;;
  *)
    declare -A processes
    get_processes processes

    # Keys look like "<cgroup>|<graph name>". Sort byte-wise on the cgroup
    # first so all entries of one cgroup are adjacent, then case-insensitively
    # on the graph name. A case-folding sort over the whole key could
    # interleave entries of different cgroups, which would emit the same
    # multigraph header more than once.
    process_names=()
    if [ "${#processes[@]}" -gt 0 ]; then
      mapfile -t process_names < <(
        printf '%s\n' "${!processes[@]}" | LC_ALL=C sort -t '|' -k1,1 -k2f
      )
    fi

    cgroup=""
    for process_name in "${process_names[@]}"; do
      IFS="|" read -ra proc <<< "$process_name"

      if is_new_cgroup cgroup "${proc[0]}"; then
        emit_multigraph_base "$cgroup"

        if is_config "$1"; then
          emit_graph_base "$cgroup"
        fi

        # The first field of a graph is drawn as AREA, the rest is stacked
        # on top of it
        draw_type="AREA"
      else
        draw_type="STACK"
      fi

      if is_config "$1"; then
        emit_graph_config "${proc[1]}" "$draw_type"
      else
        emit_value "${proc[1]}" "${processes[$process_name]}"
      fi
    done

    # Remember every name seen so far, so processes that stopped running
    # are still listed on the next run
    cache_processes process_names
    ;;
esac
|