You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

257 lines
12 KiB

3 years ago
  1. #!/usr/bin/python2
  2. # Examples:
  3. # osd status, warn at 2 missing, crit at 3: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --osd -w 2 -c 3
  4. # general health status: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --health
  5. # pg status, does not take warning or critical arguments yet. Only warns on PGs not in an active+clean state which means some PGs are not in an optimal state. ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --pg
  6. # extra performance metrics (iops, read/write bytes/sec): ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --perf
  7. # disk space, if run with --pool you only alert on that pool. when run without --pool the thresholds are for every pool. warning and critical are the max avail fields from `ceph df`: ./check_ceph.py -C ceph.conf --id icinga -k ceph.client.icinga.keyring --df -w 100 -c 50
  8. #
  9. #
  10. import sys
  11. import argparse
  12. import json
  13. import subprocess
  14. # ceph osd stat
  15. # ceph mon stat
  16. # ceph pg stat
  17. # ceph health status
  18. # ceph mon_status
  19. # ceph quorum status
  20. def checkHealth(args):
  21. ceph_health_json = subprocess.check_output(
  22. ["ceph --id {0} -c {1} -k {2} --format json health".format(args.id, args.conf, args.keyring)], shell=True)
  23. ceph_health_dict = json.loads(ceph_health_json)
  24. if ceph_health_dict['status'] == 'HEALTH_ERR':
  25. try:
  26. print "%s: %s" % (ceph_health_dict['overall_status'], ceph_health_dict['summary'][0]['summary'])
  27. except KeyError:
  28. print "%s: %s" % (ceph_health_dict['status'], ceph_health_dict['checks'].keys()[0])
  29. sys.exit(2)
  30. elif ceph_health_dict['status'] == 'HEALTH_WARN':
  31. try:
  32. print "%s: %s" % (ceph_health_dict['overall_status'], ceph_health_dict['summary'][0]['summary'])
  33. except KeyError:
  34. print "%s: %s" % (ceph_health_dict['status'], ceph_health_dict['checks'].keys()[0])
  35. sys.exit(1)
  36. elif ceph_health_dict['status'] == 'HEALTH_OK':
  37. print "%s" % (ceph_health_dict['status'])
  38. sys.exit(0)
  39. def checkOSD(args):
  40. if args.warning:
  41. WARN = float(args.warning)
  42. if args.critical:
  43. CRIT = float(args.critical)
  44. osd_stat_json = subprocess.check_output(
  45. ["ceph --id {0} -c {1} -k {2} --format json osd stat".format(args.id, args.conf, args.keyring)], shell=True)
  46. osd_stat_dict = json.loads(osd_stat_json)
  47. try:
  48. osd_not_up = osd_stat_dict['num_osds'] - osd_stat_dict['num_up_osds']
  49. except KeyError:
  50. osd_stat_dict = osd_stat_dict['osdmap']
  51. osd_not_up = osd_stat_dict['num_osds'] - osd_stat_dict['num_up_osds']
  52. osd_not_in = osd_stat_dict['num_osds'] - osd_stat_dict['num_in_osds']
  53. perf_string = "num_osds={0} num_up_osds={1} num_in_osds={2}".format(
  54. osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'])
  55. # Build in logic to handle the full and near full keys that are returned in the json
  56. if (osd_not_up >= WARN and osd_not_up < CRIT) or (osd_not_in >= WARN and osd_not_in < CRIT):
  57. print "WARNING: ALL OSDs are not up and in. {0} OSDS. {1} up, {2} in|{3}".format(osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'], perf_string)
  58. sys.exit(1)
  59. elif (osd_not_up >= CRIT) or (osd_not_in >= CRIT):
  60. print "CRITICAL: ALL OSDs are not up and in. {0} OSDS. {1} up, {2} in|{3}".format(osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'], perf_string)
  61. sys.exit(2)
  62. elif (osd_stat_dict['num_osds'] == osd_stat_dict['num_in_osds']) and (osd_stat_dict['num_osds'] == osd_stat_dict['num_up_osds']):
  63. print "ALL OSDs are up and in. {0} OSDS. {1} up, {2} in|{3}".format(osd_stat_dict['num_osds'], osd_stat_dict['num_up_osds'], osd_stat_dict['num_in_osds'], perf_string)
  64. sys.exit(0)
  65. else:
  66. print "Script shouldn't reach this point. Thar be bugs!"
  67. sys.exit(3)
  68. def checkMON(args):
  69. if args.warning:
  70. WARN = float(args.warning)
  71. if args.critical:
  72. CRIT = float(args.critical)
  73. # not written yet, more important things
  74. def checkPG(args):
  75. pg_stat_json = subprocess.check_output(
  76. ["ceph --id {0} -c {1} -k {2} --format json pg stat".format(args.id, args.conf, args.keyring)], shell=True)
  77. pg_stat_dict = json.loads(pg_stat_json)
  78. # cheap fix for nautilus change in json output
  79. if 'num_pgs' in pg_stat_dict.keys():
  80. # pre nautilus json format
  81. pg_summary = pg_stat_dict
  82. elif 'pg_summary' in pg_stat_dict.keys():
  83. # nautilus json format
  84. pg_summary = pg_stat_dict['pg_summary']
  85. num_pgs = pg_summary['num_pgs']
  86. active_pgs = 0
  87. perf_string = ""
  88. for x in pg_summary['num_pg_by_state']:
  89. if "active+clean" in x['name']:
  90. active_pgs += x['num']
  91. perf_string += "%s=%s " % (x['name'], x['num'])
  92. # Maybe build in a percentage based threshold for users who want to have thresholds like that
  93. if active_pgs < num_pgs:
  94. print "WARNING: All PGs are not active+clean: {0} PGs Total, {1}|{1}".format(num_pgs, perf_string)
  95. sys.exit(1)
  96. elif active_pgs == num_pgs:
  97. print "All PGs are active+clean: {0} PGs Total, {1}|{1}".format(num_pgs, perf_string)
  98. sys.exit(0)
  99. else:
  100. print "Script shouldn't reach this point. Thar be bugs!"
  101. sys.exit(3)
  102. def checkPerf(args):
  103. pg_stat_json = subprocess.check_output(
  104. ["ceph --id {0} -c {1} -k {2} --format json pg stat".format(args.id, args.conf, args.keyring)], shell=True)
  105. pg_stat_dict = json.loads(pg_stat_json)
  106. if 'read_bytes_sec' not in pg_stat_dict:
  107. pg_stat_dict['read_bytes_sec'] = 0
  108. if 'write_bytes_sec' not in pg_stat_dict:
  109. pg_stat_dict['write_bytes_sec'] = 0
  110. if 'io_sec' not in pg_stat_dict:
  111. pg_stat_dict['io_sec'] = 0
  112. perf_string = "read_bytes_sec={0} write_bytes_sec={1} io_sec={2}".format(
  113. pg_stat_dict['read_bytes_sec'], pg_stat_dict['write_bytes_sec'], pg_stat_dict['io_sec'])
  114. print "Healthy: Additional perf stats for cluster {0}|{0}".format(perf_string)
  115. sys.exit(0)
  116. def checkDF(args):
  117. if args.warning:
  118. WARN = float(args.warning)
  119. if args.critical:
  120. CRIT = float(args.critical)
  121. if args.byte:
  122. if args.byte == "T":
  123. byte_divisor = 1024**4
  124. perf_metric = "TB"
  125. elif args.byte == "G":
  126. byte_divisor = 1024**3
  127. perf_metric = "GB"
  128. elif args.byte == "P":
  129. byte_divisor = 1024**5
  130. perf_metric = "PB"
  131. else:
  132. byte_divisor = 1024**4
  133. perf_metric = "TB"
  134. ceph_df_json = subprocess.check_output(
  135. ["ceph --id {0} -c {1} -k {2} --format json df".format(args.id, args.conf, args.keyring)], shell=True)
  136. ceph_df_dict = json.loads(ceph_df_json)
  137. # get global stats
  138. global_bytes, global_used_bytes, global_avail_bytes = ceph_df_dict['stats'][
  139. 'total_bytes'], ceph_df_dict['stats']['total_used_bytes'], ceph_df_dict['stats']['total_avail_bytes']
  140. global_total = global_bytes/byte_divisor
  141. global_used = global_used_bytes/byte_divisor
  142. global_avail = global_avail_bytes/byte_divisor
  143. # get all pool stats
  144. pool_stats = {}
  145. for pool in ceph_df_dict['pools']:
  146. pool_stats[pool['name']] = {'bytes_used': pool['stats']['bytes_used']/byte_divisor,
  147. 'max_avail': pool['stats']['max_avail']/byte_divisor, 'objects': pool['stats']['objects']}
  148. perf_string = "global_total_bytes={0}{3} global_used_bytes={1}{3} global_avail_bytes={2}{3} ".format(
  149. global_bytes/byte_divisor, global_used_bytes/byte_divisor, global_avail_bytes/byte_divisor, perf_metric)
  150. for item in pool_stats.keys():
  151. perf_string += "{0}_bytes_used={1}{2} {0}_max_avail={3}{2} {0}_objects={4} ".format(
  152. item, pool_stats[item]['bytes_used'], perf_metric, pool_stats[item]['max_avail'], pool_stats[item]['objects'])
  153. # if pool is defined alert on that. if pool is not defined alert on the max_avail of all pools if any cross threshold
  154. if args.pool in pool_stats.keys():
  155. # print pool_stats[args.pool]
  156. # add in percentage later
  157. if (pool_stats[args.pool]['max_avail'] < WARN) and (pool_stats[args.pool]['max_avail'] > CRIT):
  158. print "WARNING: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, pool_stats[args.pool]['max_avail'], perf_metric, perf_string)
  159. sys.exit(1)
  160. elif pool_stats[args.pool]['max_avail'] < CRIT:
  161. print "CRITICAL: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, pool_stats[args.pool]['max_avail'], perf_metric, perf_string)
  162. sys.exit(2)
  163. elif pool_stats[args.pool]['max_avail'] > WARN:
  164. print "Healthy: Ceph pool {0} has {1}{2} availbale|{3}".format(args.pool, pool_stats[args.pool]['max_avail'], perf_metric, perf_string)
  165. sys.exit(0)
  166. else:
  167. print "Script shouldn't reach this point. Thar be bugs!"
  168. sys.exit(3)
  169. else:
  170. # Alerts based on all pools. If any pool is crossing the threshold we alert on it
  171. warn_list = []
  172. crit_list = []
  173. for key in pool_stats.keys():
  174. if (pool_stats[key]['max_avail'] < WARN) and (pool_stats[key]['max_avail'] > CRIT):
  175. warn_list.append("%s:%s%s" % (
  176. key, pool_stats[key]['max_avail'], perf_metric))
  177. elif pool_stats[key]['max_avail'] < CRIT:
  178. crit_list.append("%s:%s%s" % (
  179. key, pool_stats[key]['max_avail'], perf_metric))
  180. if (len(warn_list) > 0) and (len(crit_list) == 0):
  181. print "WARNING: Ceph pool(s) low on free space. {0}|{1}".format(warn_list, perf_string)
  182. sys.exit(1)
  183. elif len(crit_list) > 0:
  184. print "CRITICAL: Ceph pool(s) critically low on free space. Critial:{0} Warning:{1}|{2}".format(crit_list, warn_list, perf_string)
  185. sys.exit(2)
  186. elif (len(warn_list) == 0) and (len(crit_list) == 0):
  187. print "Healthy: All ceph pools are within free space thresholds|{0}".format(perf_string)
  188. else:
  189. print "Script shouldn't reach this point. Thar be bugs!"
  190. sys.exit(3)
  191. if __name__ == "__main__":
  192. parser = argparse.ArgumentParser(
  193. description='Runs health checks against a ceph cluster. This is designed to run on the monitoring server using the ceph client software. Supply a ceph.conf, keyring, and user to access the cluster.')
  194. parser.add_argument(
  195. '-C', '--conf', help='ceph.conf file, defaults to /etc/ceph/ceph.conf.')
  196. parser.add_argument('-id', '--id', help='Ceph authx user', required=True)
  197. parser.add_argument(
  198. '-k', '--keyring', help='Path to ceph keyring if not in /etc/ceph/client.\$id.keyring')
  199. parser.add_argument(
  200. '--health', help='Get general health status. ex. HEALTH_OK, HEALTH_WARN', action="store_true")
  201. parser.add_argument(
  202. '-o', '--osd', help='OSD status. Thresholds are in number of OSDs missing', action="store_true")
  203. parser.add_argument(
  204. '-m', '--mon', help='MON status. Thesholds are in number of mons missing')
  205. parser.add_argument(
  206. '-p', '--pg', help='PG status. No thresholds due to the large number of pg states.', action="store_true")
  207. parser.add_argument(
  208. '--perf', help='collects additional ceph performance statistics', action='store_true')
  209. parser.add_argument('--df', help='Disk/cluster usage. Reports global and all pools unless --pool is used. Warning and critical are number of -b free to the pools. This is not Raw Free, but Max Avail to the pools based on rep or k,m settings. If you do not define a pool the threshold is run agains all the pools in the cluster.', action="store_true")
  210. parser.add_argument(
  211. '-b', '--byte', help="Format to use for displaying DF data. G=Gigabyte, T=Terabyte. Use with the --df option. Defults to TB")
  212. parser.add_argument('--pool', help='Pool. Use with df')
  213. parser.add_argument('--objects', help='Object counts based on pool')
  214. parser.add_argument(
  215. '-w', '--warning', help='Warning threshold. See specific checks for value types')
  216. parser.add_argument(
  217. '-c', '--critical', help='Critical threshold. See specific checks for value types')
  218. args = parser.parse_args()
  219. if args.health:
  220. checkHealth(args)
  221. elif args.osd:
  222. checkOSD(args)
  223. elif args.pg:
  224. checkPG(args)
  225. elif args.df:
  226. checkDF(args)
  227. elif args.perf:
  228. checkPerf(args)