diff --git a/CheckCeph/README.md b/CheckCeph/README.md deleted file mode 100644 index 298e222..0000000 --- a/CheckCeph/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Quick howto - -With curl : - - curl -s https://gitlab.altinea.fr/altinea/install-scripts/src/branch/master/CheckCeph/check_ceph.py |/bin/bash - -With wget : - - wget -q -O - https://gitlab.altinea.fr/altinea/install-scripts/src/branch/master/CheckCeph/check_ceph.py |/bin/bash -t diff --git a/CheckCeph/check_ceph.py b/CheckCeph/check_ceph.py deleted file mode 100644 index 8624abf..0000000 --- a/CheckCeph/check_ceph.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/python2 - -# Examples: -# osd status, warn at 2 missing, crit at 3: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --osd -w 2 -c 3 -# general health statis: /check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --health -# pg status, does not take warning or critical arguments yet. Only warns on PGs not in an active+clean state which means some PGs are not in an optimal state. ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --pg -# extra performance metrics (iops, read/write bytes/sec): ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --perf -# disk space, if run with --pool you only alert on that pool. when run without --pool the thresholds are for every pool. warning and ciritcal are the max avail fields from `ceph df`: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --df -w 100 -c 50 -# -# -import sys -import argparse -import json -import subprocess - -# ceph osd stat -# ceph mon stat -# ceph pg stat -# ceph health statua -# ceph mon_status -# ceph quorum status - - -def checkHealth(args): - - ceph_health_json = subprocess.check_output( - ["ceph --id {0} -c {1} -k {2} --format json health".format(args.id, args.conf, args.keyring)], shell=True) - ceph_health_dict = json.loads(ceph_health_json) - - if ceph_health_dict['status'] == 'HEALTH_ERR': - try: - print "%s: %s" % (ceph_health_dict['overall_status'], ceph_health_dict['summary'][0]['summary']) - except KeyError: - print "%s: %s" % (ceph_health_dict['status'], ceph_health_dict['checks'].keys()[0]) - sys.exit(2) - elif ceph_health_dict['status'] == 'HEALTH_WARN': - try: - print "%s: %s" % (ceph_health_dict['overall_status'], ceph_health_dict['summary'][0]['summary']) - except KeyError: - print "%s: %s" % (ceph_health_dict['status'], ceph_health_dict['checks'].keys()[0]) - sys.exit(1) - elif ceph_health_dict['status'] == 'HEALTH_OK': - print "%s" % (ceph_health_dict['status']) - sys.exit(0) - - -def checkOSD(args): - if args.warning: - WARN = float(args.warning) - if args.critical: - CRIT = float(args.critical) - osd_stat_json = subprocess.check_output( - ["ceph --id {0} -c {1} -k {2} --format json osd stat".format(args.id, args.conf, args.keyring)], shell=True) - osd_stat_dict = json.loads(osd_stat_json) - try: - osd_not_up = osd_stat_dict['num_osds'] - osd_stat_dict['num_up_osds'] - except KeyError: - osd_stat_dict = osd_stat_dict['osdmap'] - osd_not_up = osd_stat_dict['num_osds'] - osd_stat_dict['num_up_osds'] - - osd_not_in = osd_stat_dict['num_osds'] - osd_stat_dict['num_in_osds'] - perf_string = "num_osds={0} num_up_osds={1} num_in_osds={2}".format( - osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds']) - -# Build in logic to handle the full and near full keys that are returned in the json - if (osd_not_up >= WARN and osd_not_up < CRIT) or (osd_not_in >= WARN and osd_not_in < CRIT): - print "WARNING: ALL OSDs are not up and in. {0} OSDS. {1} up, {2} in|{3}".format(osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'], perf_string) - sys.exit(1) - elif (osd_not_up >= CRIT) or (osd_not_in >= CRIT): - print "CRITICAL: ALL OSDs are not up and in. {0} OSDS. {1} up, {2} in|{3}".format(osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'], perf_string) - sys.exit(2) - elif (osd_stat_dict['num_osds'] == osd_stat_dict['num_in_osds']) and (osd_stat_dict['num_osds'] == osd_stat_dict['num_up_osds']): - print "ALL OSDs are up and in. {0} OSDS. {1} up, {2} in|{3}".format(osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'], perf_string) - sys.exit(0) - else: - print "Script shouldn't reach this point. Thar be bugs!" - sys.exit(3) - - -def checkMON(args): - if args.warning: - WARN = float(args.warning) - if args.critical: - CRIT = float(args.critical) - # not written yet, more important things - - -def checkPG(args): - pg_stat_json = subprocess.check_output( - ["ceph --id {0} -c {1} -k {2} --format json pg stat".format(args.id, args.conf, args.keyring)], shell=True) - pg_stat_dict = json.loads(pg_stat_json) - # cheap fix for nautilus change in json output - if 'num_pgs' in pg_stat_dict.keys(): - # pre nautilus json format - pg_summary = pg_stat_dict - elif 'pg_summary' in pg_stat_dict.keys(): - # nautilus json format - pg_summary = pg_stat_dict['pg_summary'] - num_pgs = pg_summary['num_pgs'] - active_pgs = 0 - perf_string = "" - for x in pg_summary['num_pg_by_state']: - if "active+clean" in x['name']: - active_pgs += x['num'] - perf_string += "%s=%s " % (x['name'], x['num']) -# Maybe build in a percentage based threshold for users who want to have thresholds like that - if active_pgs < num_pgs: - print "WARNING: All PGs are not active+clean: {0} PGs Total, {1}|{1}".format(num_pgs, perf_string) - sys.exit(1) - elif active_pgs == num_pgs: - print "All PGs are active+clean: {0} PGs Total, {1}|{1}".format(num_pgs, perf_string) - sys.exit(0) - else: - print "Script shouldn't reach this point. Thar be bugs!" - sys.exit(3) - - -def checkPerf(args): - pg_stat_json = subprocess.check_output( - ["ceph --id {0} -c {1} -k {2} --format json pg stat".format(args.id, args.conf, args.keyring)], shell=True) - pg_stat_dict = json.loads(pg_stat_json) - if 'read_bytes_sec' not in pg_stat_dict: - pg_stat_dict['read_bytes_sec'] = 0 - if 'write_bytes_sec' not in pg_stat_dict: - pg_stat_dict['write_bytes_sec'] = 0 - if 'io_sec' not in pg_stat_dict: - pg_stat_dict['io_sec'] = 0 - perf_string = "read_bytes_sec={0} write_bytes_sec={1} io_sec={2}".format( - pg_stat_dict['read_bytes_sec'], pg_stat_dict['write_bytes_sec'], pg_stat_dict['io_sec']) - print "Healthy: Additional perf stats for cluster {0}|{0}".format(perf_string) - sys.exit(0) - - -def checkDF(args): - if args.warning: - WARN = float(args.warning) - if args.critical: - CRIT = float(args.critical) - if args.byte: - if args.byte == "T": - byte_divisor = 1024**4 - perf_metric = "TB" - elif args.byte == "G": - byte_divisor = 1024**3 - perf_metric = "GB" - elif args.byte == "P": - byte_divisor = 1024**5 - perf_metric = "PB" - else: - byte_divisor = 1024**4 - perf_metric = "TB" - - ceph_df_json = subprocess.check_output( - ["ceph --id {0} -c {1} -k {2} --format json df".format(args.id, args.conf, args.keyring)], shell=True) - ceph_df_dict = json.loads(ceph_df_json) - # get global stats - global_bytes, global_used_bytes, global_avail_bytes = ceph_df_dict['stats'][ - 'total_bytes'], ceph_df_dict['stats']['total_used_bytes'], ceph_df_dict['stats']['total_avail_bytes'] - global_total = global_bytes/byte_divisor - global_used = global_used_bytes/byte_divisor - global_avail = global_avail_bytes/byte_divisor - - # get all pool stats - pool_stats = {} - for pool in ceph_df_dict['pools']: - pool_stats[pool['name']] = {'bytes_used': pool['stats']['bytes_used']/byte_divisor, - 'max_avail': pool['stats']['max_avail']/byte_divisor, 'objects': pool['stats']['objects']} - - perf_string = "global_total_bytes={0}{3} global_used_bytes={1}{3} global_avail_bytes={2}{3} ".format( - global_bytes/byte_divisor, global_used_bytes/byte_divisor, global_avail_bytes/byte_divisor, perf_metric) - for item in pool_stats.keys(): - perf_string += "{0}_bytes_used={1}{2} {0}_max_avail={3}{2} {0}_objects={4} ".format( - item, pool_stats[item]['bytes_used'], perf_metric, pool_stats[item]['max_avail'], pool_stats[item]['objects']) - -# if pool is defined alert on that. if pool is not defined alert on the max_avail of all pools if any cross threshold - if args.pool in pool_stats.keys(): - # print pool_stats[args.pool] - # add in percentage later - if (pool_stats[args.pool]['max_avail'] < WARN) and (pool_stats[args.pool]['max_avail'] > CRIT): - print "WARNING: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, pool_stats[args.pool]['max_avail'], perf_metric, perf_string) - sys.exit(1) - elif pool_stats[args.pool]['max_avail'] < CRIT: - print "CRITICAL: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, pool_stats[args.pool]['max_avail'], perf_metric, perf_string) - sys.exit(2) - elif pool_stats[args.pool]['max_avail'] > WARN: - print "Healthy: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, pool_stats[args.pool]['max_avail'], perf_metric, perf_string) - sys.exit(0) - else: - print "Script shouldn't reach this point. Thar be bugs!" - sys.exit(3) - - else: - # Alerts based on all pools. If any pool is crossing the threshold we alert on it - warn_list = [] - crit_list = [] - - for key in pool_stats.keys(): - if (pool_stats[key]['max_avail'] < WARN) and (pool_stats[key]['max_avail'] > CRIT): - warn_list.append("%s:%s%s" % ( - key, pool_stats[key]['max_avail'], perf_metric)) - elif pool_stats[key]['max_avail'] < CRIT: - crit_list.append("%s:%s%s" % ( - key, pool_stats[key]['max_avail'], perf_metric)) - - if (len(warn_list) > 0) and (len(crit_list) == 0): - print "WARNING: Ceph pool(s) low on free space. {0}|{1}".format(warn_list, perf_string) - sys.exit(1) - elif len(crit_list) > 0: - print "CRITICAL: Ceph pool(s) critically low on free space. Critial:{0} Warning:{1}|{2}".format(crit_list, warn_list, perf_string) - sys.exit(2) - elif (len(warn_list) == 0) and (len(crit_list) == 0): - print "Healthy: All ceph pools are within free space thresholds|{0}".format(perf_string) - else: - print "Script shouldn't reach this point. Thar be bugs!" - sys.exit(3) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Runs health checks against a ceph cluster. This is designed to run on the monitoring server using the ceph client software. Supply a ceph.conf, keyring, and user to access the cluster.') - parser.add_argument( - '-C', '--conf', help='ceph.conf file, defaults to /etc/ceph/ceph.conf.') - parser.add_argument('-id', '--id', help='Ceph authx user', required=True) - parser.add_argument( - '-k', '--keyring', help='Path to ceph keyring if not in /etc/ceph/client.\$id.keyring') - parser.add_argument( - '--health', help='Get general health status. ex. HEALTH_OK, HEALTH_WARN', action="store_true") - parser.add_argument( - '-o', '--osd', help='OSD status. Thresholds are in number of OSDs missing', action="store_true") - parser.add_argument( - '-m', '--mon', help='MON status. Thesholds are in number of mons missing') - parser.add_argument( - '-p', '--pg', help='PG status. No thresholds due to the large number of pg states.', action="store_true") - parser.add_argument( - '--perf', help='collects additional ceph performance statistics', action='store_true') - parser.add_argument('--df', help='Disk/cluster usage. Reports global and all pools unless --pool is used. Warning and critical are number of -b free to the pools. This is not Raw Free, but Max Avail to the pools based on rep or k,m settings. If you do not define a pool the threshold is run agains all the pools in the cluster.', action="store_true") - parser.add_argument( - '-b', '--byte', help="Format to use for displaying DF data. G=Gigabyte, T=Terabyte. Use with the --df option. Defults to TB") - parser.add_argument('--pool', help='Pool. Use with df') - parser.add_argument('--objects', help='Object counts based on pool') - parser.add_argument( - '-w', '--warning', help='Warning threshold. See specific checks for value types') - parser.add_argument( - '-c', '--critical', help='Critical threshold. See specific checks for value types') - - args = parser.parse_args() - - if args.health: - checkHealth(args) - elif args.osd: - checkOSD(args) - elif args.pg: - checkPG(args) - elif args.df: - checkDF(args) - elif args.perf: - checkPerf(args) diff --git a/CheckZpools/README.md b/CheckZpools/README.md deleted file mode 100644 index a15b173..0000000 --- a/CheckZpools/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# Quick howto - -With curl : - - curl -s https://gitlab.altinea.fr/altinea/install-scripts/src/branch/master/CheckZpools/check_zpools.sh |/bin/bash - -With wget : - - wget -q -O - https://gitlab.altinea.fr/altinea/install-scripts/src/branch/master/CheckZpools/check_zpools.sh |/bin/bash diff --git a/CheckZpools/check_zpools.sh b/CheckZpools/check_zpools.sh deleted file mode 100644 index 290809c..0000000 --- a/CheckZpools/check_zpools.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env bash -######################################################################### -# Script: check_zpools.sh -# Purpose: Nagios plugin to monitor status of zfs pool -# Authors: Aldo Fabi First version (2006-09-01) -# Vitaliy Gladkevitch Forked (2013-02-04) -# Claudio Kuenzler Complete redo, perfdata, etc (2013-2014) -# Doc: http://www.claudiokuenzler.com/nagios-plugins/check_zpools.php -# History: -# 2006-09-01 Original first version -# 2006-10-04 Updated (no change history known) -# 2013-02-04 Forked and released -# 2013-05-08 Make plugin work on different OS, pepp up plugin -# 2013-05-09 Bugfix in exit code handling -# 2013-05-10 Removed old exit vars (not used anymore) -# 2013-05-21 Added performance data (percentage used) -# 2013-07-11 Bugfix in zpool health check -# 2014-02-10 Bugfix in threshold comparison -# 2014-03-11 Allow plugin to run without enforced thresholds -######################################################################### -### Begin vars -STATE_OK=0 # define the exit code if status is OK -STATE_WARNING=1 # define the exit code if status is Warning -STATE_CRITICAL=2 # define the exit code if status is Critical -STATE_UNKNOWN=3 # define the exit code if status is Unknown -# Set path -PATH=$PATH:/usr/sbin:/sbin -export PATH -### End vars -######################################################################### -help="check_zpools.sh (c) 2006-2014 several authors\n -Usage: $0 -p (poolname|ALL) [-w warnpercent] [-c critpercent]\n -Example: $0 -p ALL -w 80 -c 90" -######################################################################### -# Check necessary commands are available -for cmd in zpool awk [ -do - if ! `which ${cmd} 1>/dev/null` - then - echo "UNKNOWN: ${cmd} does not exist, please check if command exists and PATH is correct" - exit ${STATE_UNKNOWN} - fi -done -######################################################################### -# Check for people who need help - aren't we all nice ;-) -if [ "${1}" = "--help" -o "${#}" = "0" ]; - then - echo -e "${help}"; - exit ${STATE_UNKNOWN}; -fi -######################################################################### -# Get user-given variables -while getopts "p:w:c:" Input; -do - case ${Input} in - p) pool=${OPTARG};; - w) warn=${OPTARG};; - c) crit=${OPTARG};; - *) echo -e $help - exit $STATE_UNKNOWN - ;; - esac -done -######################################################################### -# Did user obey to usage? -if [ -z $pool ]; then echo -e $help; exit ${STATE_UNKNOWN}; fi -######################################################################### -# Verify threshold sense -if [[ -n $warn ]] && [[ -z $crit ]]; then echo "Both warning and critical thresholds must be set"; exit $STATE_UNKNOWN; fi -if [[ -z $warn ]] && [[ -n $crit ]]; then echo "Both warning and critical thresholds must be set"; exit $STATE_UNKNOWN; fi -if [[ $warn -gt $crit ]]; then echo "Warning threshold cannot be greater than critical"; exit $STATE_UNKNOWN; fi -######################################################################### -# What needs to be checked? -## Check all pools -if [ $pool = "ALL" ] -then - POOLS=($(zpool list -Ho name)) - p=0 - for POOL in ${POOLS[*]} - do - CAPACITY=$(zpool list -Ho capacity $POOL | awk -F"%" '{print $1}') - HEALTH=$(zpool list -Ho health $POOL) - # Check with thresholds - if [[ -n $warn ]] && [[ -n $crit ]] - then - if [[ $CAPACITY -ge $crit ]] - then error[${p}]="POOL $POOL usage is CRITICAL (${CAPACITY}%)"; fcrit=1 - elif [[ $CAPACITY -ge $warn && $CAPACITY -lt $crit ]] - then error[$p]="POOL $POOL usage is WARNING (${CAPACITY}%)" - elif [ $HEALTH != "ONLINE" ] - then error[${p}]="$POOL health is $HEALTH"; fcrit=1 - fi - # Check without thresholds - else - if [ $HEALTH != "ONLINE" ] - then error[${p}]="$POOL health is $HEALTH"; fcrit=1 - fi - fi - perfdata[$p]="$POOL=${CAPACITY}% " - let p++ - done - - if [[ ${#error[*]} -gt 0 ]] - then - if [[ $fcrit -eq 1 ]]; then exit_code=2; else exit_code=1; fi - echo "ZFS POOL ALARM: ${error[*]}|${perfdata[*]}"; exit ${exit_code} - else echo "ALL ZFS POOLS OK (${POOLS[*]})|${perfdata[*]}"; exit 0 - fi - -## Check single pool -else - CAPACITY=$(zpool list -Ho capacity $pool | awk -F"%" '{print $1}') - HEALTH=$(zpool list -Ho health $pool) - - if [[ -n $warn ]] && [[ -n $crit ]] - then - # Check with thresholds - if [ $HEALTH != "ONLINE" ]; then echo "ZFS POOL $pool health is $HEALTH|$pool=${CAPACITY}%"; exit ${STATE_CRITICAL} - elif [[ $CAPACITY -gt $crit ]]; then echo "ZFS POOL $pool usage is CRITICAL (${CAPACITY}%|$pool=${CAPACITY}%)"; exit ${STATE_CRITICAL} - elif [[ $CAPACITY -gt $warn && $CAPACITY -lt $crit ]]; then echo "ZFS POOL $pool usage is WARNING (${CAPACITY}%)|$pool=${CAPACITY}%"; exit ${STATE_WARNING} - else echo "ALL ZFS POOLS OK ($pool)|$pool=${CAPACITY}%"; exit ${STATE_OK} - fi - else - # Check without thresholds - if [ $HEALTH != "ONLINE" ] - then echo "ZFS POOL $pool health is $HEALTH|$pool=${CAPACITY}%"; exit ${STATE_CRITICAL} - else echo "ALL ZFS POOLS OK ($pool)|$pool=${CAPACITY}%"; exit ${STATE_OK} - fi - fi - -fi - -echo "UKNOWN - Should never reach this part" -exit ${STATE_UNKNOWN}