mirror of
https://github.com/munin-monitoring/contrib.git
synced 2025-07-22 14:16:00 +00:00
gluster: don't send warning for self-healing errors if they get fixed
Change the reported value from the "current absolute value" to a "15min average value" and tune the warning limits accordingly. The healing value can sometimes be 1 for a short time while gluster fixes the problem by itself, so a warning is sent only if the value stays at 1 (or above) for 15 minutes. A warning is still sent immediately if the value goes to 3 or above.
This commit is contained in:
parent
86e3466b9c
commit
94dc3ff4a3
1 changed files with 35 additions and 12 deletions
|
@ -33,7 +33,9 @@ GPLv2
|
||||||
=cut
|
=cut
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
import pathlib
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
@ -42,17 +44,18 @@ import xml.etree.ElementTree
|
||||||
|
|
||||||
def run_command(command):
|
def run_command(command):
|
||||||
"""Run gluster command and return it's output as etree."""
|
"""Run gluster command and return it's output as etree."""
|
||||||
for _dummy_retry in range(3):
|
full_cmd = ['gluster', '--mode=script', '--xml'] + command
|
||||||
|
for sleep_retry in (1, 2, 3, 4, 5): # retry for 15 seconds
|
||||||
try:
|
try:
|
||||||
text = subprocess.run(['gluster', '--mode=script', '--xml'] +
|
text = subprocess.run(full_cmd, check=False,
|
||||||
command, check=False, stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
encoding='utf-8').stdout
|
encoding='utf-8').stdout
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
return None
|
return None
|
||||||
if text.count('\n') > 10:
|
if text.count('\n') > 10:
|
||||||
break
|
break
|
||||||
time.sleep(2) # Sleep and retry
|
time.sleep(sleep_retry)
|
||||||
try:
|
try:
|
||||||
return xml.etree.ElementTree.fromstring(text)
|
return xml.etree.ElementTree.fromstring(text)
|
||||||
except xml.etree.ElementTree.ParseError:
|
except xml.etree.ElementTree.ParseError:
|
||||||
|
@ -140,6 +143,26 @@ def find_volumes(need_details):
|
||||||
return volumes
|
return volumes
|
||||||
|
|
||||||
|
|
||||||
|
def print_avg(label, value):
    """Print the average of the latest samples of a field instead of a gauge.

    The current sample is appended to the samples stored for ``label`` in
    the JSON state file ``gluster.json`` inside the directory named by the
    ``MUNIN_PLUGSTATE`` environment variable.  Only the newest three samples
    (current + 2 previous) are kept, and their mean is printed as
    ``<label>.value <mean>``.

    label -- field name; used as key in the state file and in the output.
    value -- current sample; anything accepted by ``int()``.

    If ``MUNIN_PLUGSTATE`` is not set, nothing is read or written and the
    current sample alone is printed.  A missing, undecodable or malformed
    state file is treated as empty and rewritten.
    """
    window = 3  # Keep current + 2 previous samples

    # Without a state directory there is nowhere to persist samples;
    # pathlib.Path(None) would raise TypeError, so skip persistence.
    state_dir = os.getenv('MUNIN_PLUGSTATE')
    statefile = pathlib.Path(state_dir) / 'gluster.json' if state_dir else None

    # Read previous values from state file
    state = {}
    if statefile is not None:
        try:
            loaded = json.loads(statefile.read_text('utf-8'))
        except (FileNotFoundError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            loaded = None
        if isinstance(loaded, dict):  # Ignore valid JSON of the wrong shape
            state = loaded

    # Add current value, dropping stored entries that are not numbers
    previous = state.get(label)
    if not isinstance(previous, list):
        previous = []
    history = [item for item in previous if isinstance(item, (int, float))]
    values = (history + [int(value)])[-window:]
    state[label] = values
    if statefile is not None:
        statefile.write_text(json.dumps(state, indent=2), 'utf-8')

    # Print average value
    print(f'{label}.value {sum(values) / len(values)}')
|
||||||
|
|
||||||
|
|
||||||
def print_status(config):
|
def print_status(config):
|
||||||
"""Print config or values."""
|
"""Print config or values."""
|
||||||
# pylint: disable=too-many-branches
|
# pylint: disable=too-many-branches
|
||||||
|
@ -162,12 +185,12 @@ def print_status(config):
|
||||||
name = safe_name(volume['name'])
|
name = safe_name(volume['name'])
|
||||||
if config or both:
|
if config or both:
|
||||||
print(f'{name}.label Volume {volume["name"]}')
|
print(f'{name}.label Volume {volume["name"]}')
|
||||||
print(f'{name}.warning 2:')
|
print(f'{name}.warning 1.01:')
|
||||||
if not config or both:
|
if not config or both:
|
||||||
status = int(volume['status'])
|
status = int(volume['status'])
|
||||||
if status and all(brick['status'] for brick in volume['bricks']):
|
if status and all(brick['status'] for brick in volume['bricks']):
|
||||||
status = 2
|
status = 2
|
||||||
print(f'{name}.value {status}')
|
print_avg(name, status)
|
||||||
|
|
||||||
# Brick heal status
|
# Brick heal status
|
||||||
for volume in volumes:
|
for volume in volumes:
|
||||||
|
@ -182,19 +205,19 @@ def print_status(config):
|
||||||
bname = safe_name(brick['uuid'])
|
bname = safe_name(brick['uuid'])
|
||||||
print(f'{bname}_pending.label {brick["name"]} '
|
print(f'{bname}_pending.label {brick["name"]} '
|
||||||
'in heal pending')
|
'in heal pending')
|
||||||
print(f'{bname}_pending.warning 0')
|
print(f'{bname}_pending.warning 0.99')
|
||||||
print(f'{bname}_split.label {brick["name"]} '
|
print(f'{bname}_split.label {brick["name"]} '
|
||||||
'in split-brain')
|
'in split-brain')
|
||||||
print(f'{bname}_split.warning 0')
|
print(f'{bname}_split.warning 0.99')
|
||||||
print(f'{bname}_healing.label {brick["name"]} '
|
print(f'{bname}_healing.label {brick["name"]} '
|
||||||
'possibly healing')
|
'possibly healing')
|
||||||
print(f'{bname}_healing.warning 0')
|
print(f'{bname}_healing.warning 0.99')
|
||||||
if not config or both:
|
if not config or both:
|
||||||
for brick in volume['bricks']:
|
for brick in volume['bricks']:
|
||||||
bname = safe_name(brick['uuid'])
|
bname = safe_name(brick['uuid'])
|
||||||
print(f'{bname}_pending.value {brick["heal_pending"]}')
|
print_avg(f'{bname}_pending', brick['heal_pending'])
|
||||||
print(f'{bname}_split.value {brick["heal_split"]}')
|
print_avg(f'{bname}_split', brick['heal_split'])
|
||||||
print(f'{bname}_healing.value {brick["heal_healing"]}')
|
print_avg(f'{bname}_healing', brick['heal_healing'])
|
||||||
|
|
||||||
# Brick disk/inode free
|
# Brick disk/inode free
|
||||||
for gtype in ('disk', 'inode'):
|
for gtype in ('disk', 'inode'):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue