
Commit f62016f: Getting hadoop3 stats
GilHoggarth committed Apr 6, 2023 (1 parent: 234450b)
Showing 14 changed files with 239 additions and 86 deletions.
30 changes: 30 additions & 0 deletions h3exporter/README
@@ -1 +1,31 @@
Python3 script to report WA hadoop3 key stats


CURRENTLY the systemctl setup below does NOT work - there is an issue with D-Bus and the monitor user

To set up as a systemctl daemon:
* Ensure user systemd directory exists
* mkdir -p ~/.config/systemd/user

* Create service file as ~/.config/systemd/user/h3exporter.service, chmod 644

```
[Unit]
Description=h3exporter systemctl configuration
After=network.target

[Service]
Type=simple
ExecStart=/home/monitor/github/ukwa-monitor/h3exporter/run_h3exporter.sh
Restart=always
RestartSec=30

[Install]
WantedBy=default.target
```

* Reload user systemctl
* systemctl --user daemon-reload

* Enable and start h3exporter service
* systemctl --user enable --now h3exporter
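
A likely cause of the D-Bus issue noted above (an assumption, not verified here) is that no systemd user instance is running for the monitor user, so systemctl --user cannot reach its bus. Common remedies:

* Enable lingering for the monitor user (run as root) so its user instance starts at boot
* loginctl enable-linger monitor

* Ensure XDG_RUNTIME_DIR is set before running systemctl --user
* export XDG_RUNTIME_DIR=/run/user/$(id -u)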
46 changes: 43 additions & 3 deletions h3exporter/config/settings
@@ -1,9 +1,49 @@
[dev]
logfpfn = logs/h3exporter.log
loglevel = DEBUG
sleep = 10

# h3nn settings
#h3nnjmx = http://h3nn.api.wa.bl.uk/jmx
#namenodeinfo = {'qry':'Hadoop:service=NameNode', 'name':'NameNodeInfo'}
#fsnamesystem = {'qry':'Hadoop:service=NameNode', 'name':'FSNamesystem'}
namenodeinfo = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo
fsnamesystem = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem

# prometheus settings
pushgtw = http://monitor-pushgateway.dapi.wa.bl.uk
job = h3stats
metric = hadoop3stats
desc = Scraped hadoop3 stats


[beta]
logfpfn = logs/h3exporter.log
loglevel = INFO
sleep = 30

# h3nn settings
namenodeinfo = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo
fsnamesystem = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem

# prometheus settings
pushgtw = http://monitor-pushgateway.bapi.wa.bl.uk
job = h3stats
metric = hadoop3stats
desc = Scraped hadoop3 stats


[prod]
logfpfn = logs/h3exporter.log
loglevel = INFO

sleep = 55

# h3nn settings
namenodeinfo = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo
fsnamesystem = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem

# prometheus settings
pushgtw = http://monitor-pushgateway.api.wa.bl.uk
job = h3stats
metric = hadoop3stats
desc = Scraped hadoop3 stats
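
A minimal sketch of how these per-environment sections could be consumed (this mirrors the approach in modules/config.py, but the DEPLOY_ENV variable name and default below are illustrative assumptions, not the repository's actual selection logic):

```python
import configparser
import os

def settings_read(cfg_fpfn, default_env='dev'):
    # Assumed: the deployment environment name (dev/beta/prod) arrives via an
    # environment variable; the real variable name is not shown in this diff.
    senv = os.environ.get('DEPLOY_ENV', default_env)
    cfg = configparser.ConfigParser()
    cfg.read(cfg_fpfn)
    settings = dict(cfg[senv])      # e.g. {'pushgtw': '...', 'sleep': '10', ...}
    settings['senv'] = senv
    return settings

# settings = settings_read('config/settings')
# settings['namenodeinfo'], settings['pushgtw'], int(settings['sleep'])
```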
70 changes: 28 additions & 42 deletions h3exporter/h3exporter.py
@@ -5,68 +5,54 @@
'''
import os, sys, logging
import __main__
from http.server import BaseHTTPRequestHandler, HTTPServer
import re
import time

# script modules
from modules import config
from modules import log
from modules import h3stats

# globals
logger = logging.getLogger(__name__)
SCRIPTNAME = os.path.splitext(os.path.basename(__main__.__file__))[0]
CONFIG = 'config/settings'
REQUEST = re.compile("^\w+\s+(/.+)\s+HTTP/\d.\d$")


# classes and functions -----
class webServer(BaseHTTPRequestHandler):
global REQUEST
def _set_headers(self):
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
def do_HEAD(self):
self._set_headers()
def do_GET(self):
self._set_headers()
# grab request
try:
reqMatch = REQUEST.match(self.requestline)
request = reqMatch.group(1)
# process request
if request:
process_request(request)
except Exception as e:
logger.warning(f"Failed to match request in [{self.requestline}]")

def process_request(request):
logger.debug(f"Received request: [{request}]")


# script --------------------
def script():
# read config
global CONFIG
settings = config.settings_read(CONFIG)

# read service environment variables, configure logger
settings = config.settings_read(CONFIG)
log.configure(settings['logfpfn'], settings['loglevel'])
log.start()
log.list_settings(settings)

# create web service
monitorServer = HTTPServer((settings['host'], int(settings['port'])), webServer)
logger.info(f"Started Hadoop3 stats web server: {settings['host']}:{settings['port']}")
try:
monitorServer.serve_forever()
except Exception as e:
logger.warning(f"Hadoop3 stats web server exiting")
logger.warning(f"Message: [{e}]")

# close and end
monitorServer.server_close()
logger.error(f"//////////////////// RUNNING AS DAEMON - SHOULD NEVER FINISH /////////////////////\n")

# previous values
prevUsedPercent = prevDeadNodes = prevUnderReplicated = 0
while True:
# get hadoop3 stats
failStatus, usedPercent, deadNodes, underReplicated = h3stats.get_hadoop_stats(settings)
if failStatus:
logger.warning(f"Failed to get hadoop3 stats")
time.sleep(int(settings['sleep']))  # avoid a tight retry loop when the stats fetch fails
continue

# send stats to prometheus if change
if ( int(usedPercent) != int(prevUsedPercent) ) \
or ( deadNodes != prevDeadNodes ) \
or ( underReplicated != prevUnderReplicated):
h3stats.send_hadoop_stats(settings, usedPercent, deadNodes, underReplicated)

# update previous values
prevUsedPercent = usedPercent
prevDeadNodes = deadNodes
prevUnderReplicated = underReplicated

# sleep until next time to send hadoop3 stats
time.sleep(int(settings['sleep']))

log.stop('Unexpected stop')

if __name__ == '__main__':
script()
2 changes: 0 additions & 2 deletions h3exporter/modules/config.py
@@ -29,7 +29,5 @@ def settings_read(cfgFpfn):
# gather settings
config = cfg[senv]
config['senv'] = senv
# for _k in cfg[senv]:
# config[_k] = cfg[senv][_k]

return config
84 changes: 84 additions & 0 deletions h3exporter/modules/h3stats.py
@@ -0,0 +1,84 @@
import os, sys, logging
logger = logging.getLogger(__name__)
import requests
import json
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway


# internal functions --------
def _get_url(url):
'''
Hadoop3 /jmx output doesn't appear to be well-formed JSON, at least according to
https://jsononline.net/string-to-json and in testing.
So the query string is not passed via the requests 'params' argument
(https://docs.python-requests.org/en/latest/user/quickstart/#passing-parameters-in-urls);
instead the complete URL is used in requests.get().
'''
res = False
logger.debug(f"Getting [{url}]")
try:
res = requests.get(url)
except Exception as e:
logger.error(f"Failed to get [{url}]")
return False

# check response
if res.status_code != requests.codes.ok:
logger.error(f"Response code not okay for [{url}]")
logger.error(f"Response code: [{res.status_code}]")
return False
else:
return res

# functions -----------------
def get_hadoop_stats(settings):
status = 0
usedPercent = 0
deadNodes = 0
underReplicated = 0

# get urls, check response, gather values
for url in [ settings['namenodeinfo'], settings['fsnamesystem'] ]:
res = _get_url(url)
if not res:
return 1, False, False, False

# Annoyingly, the Hadoop3 JSON blobs (like DeadNodes) arrive as JSON-encoded strings
# rather than nested structures, so they must be decoded again with json.loads()
# This block is wrapped in try/except in case the returned dataset is unexpected
try:
resJson = res.json()
if 'beans' in resJson:
beans = resJson['beans']
for _dbeans in beans:
if 'PercentUsed' in _dbeans:
usedPercent = float(_dbeans['PercentUsed'])
if 'DeadNodes' in _dbeans:
deadNodes = len(json.loads(_dbeans['DeadNodes']))
if 'UnderReplicatedBlocks' in _dbeans:
underReplicated = int(_dbeans['UnderReplicatedBlocks'])
else:
logger.warning(f"No 'beans' in resJson: {resJson}")
status = 1
except Exception as e:
logger.warning(f"Failed traversing response json [{e}]")
status = 1

logger.debug(f"usedPercent:\t\t [{usedPercent}]")
logger.debug(f"deadNodes:\t\t [{deadNodes}]")
logger.debug(f"underReplicated:\t [{underReplicated}]")
return status, usedPercent, deadNodes, underReplicated

def send_hadoop_stats(settings, usedPercent, deadNodes, underReplicated):
registry = CollectorRegistry()
g = Gauge(settings['metric'], settings['desc'], labelnames=['instance'], registry=registry)
g.labels(instance='usedPercent').set(usedPercent)
g.labels(instance='deadNodes').set(deadNodes)
g.labels(instance='underReplicatedBlocks').set(underReplicated)

# push to prometheus
try:
push_to_gateway(settings['pushgtw'], registry=registry, job=settings['job'])
logger.info(f"Pushed to gateway {settings['pushgtw']} {usedPercent}%, deadnodes {deadNodes}, under-rep {underReplicated}")
except Exception as e:
logger.warning(f"Failed push to gateway\nError: [{e}]")
3 changes: 2 additions & 1 deletion h3exporter/modules/log.py
@@ -22,5 +22,6 @@ def list_settings(settings):
def start():
logger.info(f"Start {'-'*38}")

def stop():
def stop(msg):
if msg: logger.warning(f"{msg}")
logger.info(f"Fin {'-'*40}")
2 changes: 2 additions & 0 deletions h3exporter/requirements.txt
@@ -0,0 +1,2 @@
requests
prometheus_client
25 changes: 25 additions & 0 deletions h3exporter/run_h3exporter.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
#### Swarm environment is also determined in the script, from the hostname, to select settings

# setup venv
export PYTHONPATH=/home/monitor/github/ukwa-monitor/h3exporter
source ${PYTHONPATH}/venv/bin/activate

# ensure log directory exists
[[ -d ${PYTHONPATH}/logs/ ]] || mkdir ${PYTHONPATH}/logs

# ensure python libraries installed
cd ${PYTHONPATH}
pip install -r requirements.txt

# run h3exporter script
${PYTHONPATH}/h3exporter.py &
exit 0


# ------ unreachable: the script exits above; hostname-dependent variant kept below
if [[ $(hostname -s) =~ ^(prod|monitor) ]]; then
${PYTHONPATH}/h3exporter.py > /dev/null & # disable generation of large logs over time
else
${PYTHONPATH}/h3exporter.py &
fi
2 changes: 0 additions & 2 deletions monitor/beta/start_monitor.sh
@@ -16,8 +16,6 @@ export DATA_ALERTMANAGER=${DATA_PREFIX}/ukwa-monitor/alertmanager
export HTTP_PROXY=http://explorer2:3128/
export HDFS_EXPORTER='hdfs-exporter.bapi.wa.bl.uk:80'

export HDFS3_EXPORTER='h3exporter.bapi.wa.bl.uk:80'

export ALERT_RECEIVER='beta'
export ALERT_EMAIL_DEV='gil.hoggarth@bl.uk'
export ALERT_EMAIL_BETA='gil.hoggarth@bl.uk'
2 changes: 0 additions & 2 deletions monitor/dev/start_monitor.sh
@@ -16,8 +16,6 @@ export DATA_ALERTMANAGER=${DATA_PREFIX}/ukwa-monitor/alertmanager
export HTTP_PROXY=http://explorer2:3128/
export HDFS_EXPORTER='hdfs-exporter.dapi.wa.bl.uk:80'

export HDFS3_EXPORTER='h3exporter.dapi.wa.bl.uk:80'

export ALERT_RECEIVER='dev'
export ALERT_EMAIL_DEV='gil.hoggarth@bl.uk'
export ALERT_EMAIL_BETA='gil.hoggarth@bl.uk'
@@ -48,50 +48,52 @@ def script():
xpos = FIRSTCOL # panel x position for hadoop 0.20, 0
ypos = blockRow # panel y position for hadoop 0.20, 0
blockCol = xpos # set block beginning column
replace.title(oH, pnl=panelTitle, title='Hadoop 0.20', h=titleHeight, w=blockWidth, x=xpos, y=ypos) # 0.0
replace.title(oH, pnl=panelTitle, title='Hadoop 0.20', h=titleHeight, w=blockWidth, x=xpos, y=ypos)
ypos += titleHeight # increment ypos by title height
for _title in ['Up', 'CPU', 'Nodes']:
replace.single(oH, pnl=panelStat, job='hadoop', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 0.1, 2.1, 4.1
replace.single(oH, pnl=panelStat, job='hadoop', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH # increment xpos by WIDTH
replace.single(oH, pnl=panelStatHadoopUsed, job='hadoop', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos) # 6.1
replace.single(oH, pnl=panelStatHadoopUsed, job='hadoop', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos)
xpos = blockCol # reset xpos to beginning of block
ypos += HEIGHT # increment ypos by HEIGHT
replace.single(oH, pnl=panelStat, job='hadoop', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 0.3
replace.single(oH, pnl=panelStat, job='hadoop', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH # increment xpos by WIDTH
replace.single(oH, pnl=panelStat, job='hadoop', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 2.3
replace.single(oH, pnl=panelStat, job='hadoop', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH # increment xpos by WIDTH
replace.single(oH, pnl=panelStat, job='hadoop', title='Under-rep', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 4.3
replace.single(oH, pnl=panelStat, job='hadoop', title='Under-rep', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)

# gluster
xpos = blockWidth # panel x position for gluster, 8
ypos = blockRow # panel y position for gluster, 0
blockCol = xpos
replace.title(oH, pnl=panelTitle, title='Gluster', h=titleHeight, w=blockWidth, x=xpos, y=ypos) # 8.0
replace.title(oH, pnl=panelTitle, title='Gluster', h=titleHeight, w=blockWidth, x=xpos, y=ypos)
ypos += titleHeight
for _title in ['Up', 'CPU', 'Dsk', 'Mem']:
replace.single(oH, pnl=panelStat, job='gluster', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 8.1, 10.1, 12.1, 14.1
replace.single(oH, pnl=panelStat, job='gluster', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
xpos = blockCol
ypos += HEIGHT
replace.space(oH, pnl=panelSpace, h=HEIGHT, w=blockWidth, x=xpos, y=ypos) # 8.3
replace.space(oH, pnl=panelSpace, h=HEIGHT, w=blockWidth, x=xpos, y=ypos)

## hadoop 3
xpos += blockWidth # panel x position for hadoop 3, blockWidth as previous spacer, 16
ypos = blockRow # panel y position for hadoop 3, 0
blockCol = xpos
replace.title(oH, pnl=panelTitle, title='Hadoop 3', h=titleHeight, w=blockWidth, x=xpos, y=ypos) # 16.0
replace.title(oH, pnl=panelTitle, title='Hadoop 3', h=titleHeight, w=blockWidth, x=xpos, y=ypos)
ypos += titleHeight
for _title in ['Up', 'CPU', 'Dead Nodes']:
replace.single(oH, pnl=panelStat, job='hadoop3', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 16.1, 18.1, 20.1
for _title in ['Up', 'CPU']:
replace.single(oH, pnl=panelStat, job='hadoop3', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStatHadoopUsed, job='hadoop3', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos) # 22.1
replace.single(oH, pnl=panelStat, job='h3stats', title='Dead Nodes', threshold2='0.9', threshold3='1.2', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStatHadoopUsed, job='h3stats', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos)
xpos = blockCol
ypos += HEIGHT
replace.single(oH, pnl=panelStat, job='hadoop3', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 16.3
replace.single(oH, pnl=panelStat, job='hadoop3', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStat, job='hadoop3', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 18.3
replace.single(oH, pnl=panelStat, job='hadoop3', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStat, job='hadoop3', title='Under-rep', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 20.3
replace.single(oH, pnl=panelStat, job='h3stats', title='Under-rep', threshold2='9', threshold3='250', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)

# second 'row' of sections -----
blockRow = (titleHeight + HEIGHT + HEIGHT) # blockRow set to exceed above titles and panels, 5