
Commit f62016f: Getting hadoop3 stats
GilHoggarth committed Apr 6, 2023 (1 parent: 234450b)
Showing 14 changed files with 239 additions and 86 deletions.
30 changes: 30 additions & 0 deletions h3exporter/README
@@ -1 +1,31 @@
Python3 script to report WA hadoop3 key stats


CURRENTLY the systemctl setup below does NOT work - there is an issue with D-Bus and the monitor user

To set up as a systemctl daemon:
* Ensure user systemd directory exists
* mkdir -p ~/.config/systemd/user

* Create service file as ~/.config/systemd/user/h3exporter.service, chmod 644

```
[Unit]
Description=h3exporter systemctl configuration
After=network.target

[Service]
Type=simple
ExecStart=/home/monitor/github/ukwa-monitor/h3exporter/run_h3exporter.sh
Restart=always
RestartSec=30

[Install]
WantedBy=default.target
```

* Reload user systemctl
* systemctl --user daemon-reload

* Enable and start h3exporter service
* systemctl --user enable --now h3exporter
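
A likely cause of the D-Bus issue noted above (an assumption, not verified here) is that no systemd user instance is running for the monitor user, so systemctl --user cannot reach its bus. Common remedies:

* Enable lingering for the monitor user (run as root) so its user instance starts at boot
* loginctl enable-linger monitor

* Ensure XDG_RUNTIME_DIR is set before running systemctl --user
* export XDG_RUNTIME_DIR=/run/user/$(id -u)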
46 changes: 43 additions & 3 deletions h3exporter/config/settings
@@ -1,9 +1,49 @@
[dev]
logfpfn = logs/h3exporter.log
loglevel = DEBUG
sleep = 10

# h3nn settings
#h3nnjmx = http://h3nn.api.wa.bl.uk/jmx
#namenodeinfo = {'qry':'Hadoop:service=NameNode', 'name':'NameNodeInfo'}
#fsnamesystem = {'qry':'Hadoop:service=NameNode', 'name':'FSNamesystem'}
namenodeinfo = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo
fsnamesystem = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem

# prometheus settings
pushgtw = http://monitor-pushgateway.dapi.wa.bl.uk
job = h3stats
metric = hadoop3stats
desc = Scraped hadoop3 stats


[beta]
logfpfn = logs/h3exporter.log
loglevel = INFO
sleep = 30

# h3nn settings
namenodeinfo = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo
fsnamesystem = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem

# prometheus settings
pushgtw = http://monitor-pushgateway.bapi.wa.bl.uk
job = h3stats
metric = hadoop3stats
desc = Scraped hadoop3 stats


[prod]
logfpfn = logs/h3exporter.log
loglevel = INFO

sleep = 55

# h3nn settings
namenodeinfo = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo
fsnamesystem = http://h3nn.api.wa.bl.uk/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem

# prometheus settings
pushgtw = http://monitor-pushgateway.api.wa.bl.uk
job = h3stats
metric = hadoop3stats
desc = Scraped hadoop3 stats
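
A minimal sketch of how these per-environment sections could be consumed (this mirrors the approach in modules/config.py, but the DEPLOY_ENV variable name and default below are illustrative assumptions, not the repository's actual selection logic):

```python
import configparser
import os

def settings_read(cfg_fpfn, default_env='dev'):
    # Assumed: the deployment environment name (dev/beta/prod) arrives via an
    # environment variable; the real variable name is not shown in this diff.
    senv = os.environ.get('DEPLOY_ENV', default_env)
    cfg = configparser.ConfigParser()
    cfg.read(cfg_fpfn)
    settings = dict(cfg[senv])      # e.g. {'pushgtw': '...', 'sleep': '10', ...}
    settings['senv'] = senv
    return settings

# settings = settings_read('config/settings')
# settings['namenodeinfo'], settings['pushgtw'], int(settings['sleep'])
```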
70 changes: 28 additions & 42 deletions h3exporter/h3exporter.py
@@ -5,68 +5,54 @@
'''
import os, sys, logging
import __main__
from http.server import BaseHTTPRequestHandler, HTTPServer
import re
import time

# script modules
from modules import config
from modules import log
from modules import h3stats

# globals
logger = logging.getLogger(__name__)
SCRIPTNAME = os.path.splitext(os.path.basename(__main__.__file__))[0]
CONFIG = 'config/settings'
REQUEST = re.compile("^\w+\s+(/.+)\s+HTTP/\d.\d$")


# classes and functions -----
class webServer(BaseHTTPRequestHandler):
global REQUEST
def _set_headers(self):
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
def do_HEAD(self):
self._set_headers()
def do_GET(self):
self._set_headers()
# grab request
try:
reqMatch = REQUEST.match(self.requestline)
request = reqMatch.group(1)
# process request
if request:
process_request(request)
except Exception as e:
logger.warning(f"Failed to match request in [{self.requestline}]")

def process_request(request):
logger.debug(f"Received request: [{request}]")


# script --------------------
def script():
# read config
global CONFIG
settings = config.settings_read(CONFIG)

# read service environment variables, configure logger
settings = config.settings_read(CONFIG)
log.configure(settings['logfpfn'], settings['loglevel'])
log.start()
log.list_settings(settings)

# create web service
monitorServer = HTTPServer((settings['host'], int(settings['port'])), webServer)
logger.info(f"Started Hadoop3 stats web server: {settings['host']}:{settings['port']}")
try:
monitorServer.serve_forever()
except Exception as e:
logger.warning(f"Hadoop3 stats web server exiting")
logger.warning(f"Message: [{e}]")

# close and end
monitorServer.server_close()
logger.error(f"//////////////////// RUNNING AS DAEMON - SHOULD NEVER FINISH /////////////////////\n")

# previous values
prevUsedPercent = prevDeadNodes = prevUnderReplicated = 0
while True:
# get hadoop3 stats
failStatus, usedPercent, deadNodes, underReplicated = h3stats.get_hadoop_stats(settings)
if failStatus:
logger.warning(f"Failed to get hadoop3 stats")
time.sleep(int(settings['sleep']))  # avoid a tight retry loop when the stats fetch fails
continue

# send stats to prometheus if change
if ( int(usedPercent) != int(prevUsedPercent) ) \
or ( deadNodes != prevDeadNodes ) \
or ( underReplicated != prevUnderReplicated):
h3stats.send_hadoop_stats(settings, usedPercent, deadNodes, underReplicated)

# update previous values
prevUsedPercent = usedPercent
prevDeadNodes = deadNodes
prevUnderReplicated = underReplicated

# sleep until next time to send hadoop3 stats
time.sleep(int(settings['sleep']))

log.stop('Unexpected stop')

if __name__ == '__main__':
script()
2 changes: 0 additions & 2 deletions h3exporter/modules/config.py
@@ -29,7 +29,5 @@ def settings_read(cfgFpfn):
# gather settings
config = cfg[senv]
config['senv'] = senv
# for _k in cfg[senv]:
# config[_k] = cfg[senv][_k]

return config
84 changes: 84 additions & 0 deletions h3exporter/modules/h3stats.py
@@ -0,0 +1,84 @@
import os, sys, logging
logger = logging.getLogger(__name__)
import requests
import json
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway


# internal functions --------
def _get_url(url):
'''
Hadoop3 /jmx output doesn't appear to be well-formed JSON, at least according to
https://jsononline.net/string-to-json and in testing.
So the query string is not passed via the requests 'params' argument
(https://docs.python-requests.org/en/latest/user/quickstart/#passing-parameters-in-urls);
instead the complete URL is used in requests.get().
'''
res = False
logger.debug(f"Getting [{url}]")
try:
res = requests.get(url)
except Exception as e:
logger.error(f"Failed to get [{url}]")
return False

# check response
if res.status_code != requests.codes.ok:
logger.error(f"Response code not okay for [{url}]")
logger.error(f"Response code: [{res.status_code}]")
return False
else:
return res

# functions -----------------
def get_hadoop_stats(settings):
status = 0
usedPercent = 0
deadNodes = 0
underReplicated = 0

# get urls, check response, gather values
for url in [ settings['namenodeinfo'], settings['fsnamesystem'] ]:
res = _get_url(url)
if not res:
return 1, False, False, False

# Annoyingly, the Hadoop3 JSON blobs (like DeadNodes) arrive as JSON-encoded strings
# rather than nested structures, so they must be decoded again with json.loads()
# This block is wrapped in try/except in case the returned dataset is unexpected
try:
resJson = res.json()
if 'beans' in resJson:
beans = resJson['beans']
for _dbeans in beans:
if 'PercentUsed' in _dbeans:
usedPercent = float(_dbeans['PercentUsed'])
if 'DeadNodes' in _dbeans:
deadNodes = len(json.loads(_dbeans['DeadNodes']))
if 'UnderReplicatedBlocks' in _dbeans:
underReplicated = int(_dbeans['UnderReplicatedBlocks'])
else:
logger.warning(f"No 'beans' in resJson: {resJson}")
status = 1
except Exception as e:
logger.warning(f"Failed traversing response json [{e}]")
status = 1

logger.debug(f"usedPercent:\t\t [{usedPercent}]")
logger.debug(f"deadNodes:\t\t [{deadNodes}]")
logger.debug(f"underReplicated:\t [{underReplicated}]")
return status, usedPercent, deadNodes, underReplicated

def send_hadoop_stats(settings, usedPercent, deadNodes, underReplicated):
registry = CollectorRegistry()
g = Gauge(settings['metric'], settings['desc'], labelnames=['instance'], registry=registry)
g.labels(instance='usedPercent').set(usedPercent)
g.labels(instance='deadNodes').set(deadNodes)
g.labels(instance='underReplicatedBlocks').set(underReplicated)

# push to prometheus
try:
push_to_gateway(settings['pushgtw'], registry=registry, job=settings['job'])
logger.info(f"Pushed to gateway {settings['pushgtw']} {usedPercent}%, deadnodes {deadNodes}, under-rep {underReplicated}")
except Exception as e:
logger.warning(f"Failed push to gateway\nError: [{e}]")
3 changes: 2 additions & 1 deletion h3exporter/modules/log.py
@@ -22,5 +22,6 @@ def list_settings(settings):
def start():
logger.info(f"Start {'-'*38}")

def stop():
def stop(msg):
if msg: logger.warning(f"{msg}")
logger.info(f"Fin {'-'*40}")
2 changes: 2 additions & 0 deletions h3exporter/requirements.txt
@@ -0,0 +1,2 @@
requests
prometheus_client
25 changes: 25 additions & 0 deletions h3exporter/run_h3exporter.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
#### Swarm environment is also determined in the script, from the hostname, to select settings

# setup venv
export PYTHONPATH=/home/monitor/github/ukwa-monitor/h3exporter
source ${PYTHONPATH}/venv/bin/activate

# ensure log directory exists
[[ -d ${PYTHONPATH}/logs/ ]] || mkdir ${PYTHONPATH}/logs

# ensure python libraries installed
cd ${PYTHONPATH}
pip install -r requirements.txt

# run h3exporter script
${PYTHONPATH}/h3exporter.py &
exit 0


# ------ unreachable: the script exits above; hostname-dependent variant kept below
if [[ $(hostname -s) =~ ^(prod|monitor) ]]; then
${PYTHONPATH}/h3exporter.py > /dev/null & # disable generation of large logs over time
else
${PYTHONPATH}/h3exporter.py &
fi
2 changes: 0 additions & 2 deletions monitor/beta/start_monitor.sh
@@ -16,8 +16,6 @@ export DATA_ALERTMANAGER=${DATA_PREFIX}/ukwa-monitor/alertmanager
export HTTP_PROXY=http://explorer2:3128/
export HDFS_EXPORTER='hdfs-exporter.bapi.wa.bl.uk:80'

export HDFS3_EXPORTER='h3exporter.bapi.wa.bl.uk:80'

export ALERT_RECEIVER='beta'
export ALERT_EMAIL_DEV='gil.hoggarth@bl.uk'
export ALERT_EMAIL_BETA='gil.hoggarth@bl.uk'
2 changes: 0 additions & 2 deletions monitor/dev/start_monitor.sh
@@ -16,8 +16,6 @@ export DATA_ALERTMANAGER=${DATA_PREFIX}/ukwa-monitor/alertmanager
export HTTP_PROXY=http://explorer2:3128/
export HDFS_EXPORTER='hdfs-exporter.dapi.wa.bl.uk:80'

export HDFS3_EXPORTER='h3exporter.dapi.wa.bl.uk:80'

export ALERT_RECEIVER='dev'
export ALERT_EMAIL_DEV='gil.hoggarth@bl.uk'
export ALERT_EMAIL_BETA='gil.hoggarth@bl.uk'
@@ -48,50 +48,52 @@ def script():
xpos = FIRSTCOL # panel x position for hadoop 0.20, 0
ypos = blockRow # panel y position for hadoop 0.20, 0
blockCol = xpos # set block beginning column
replace.title(oH, pnl=panelTitle, title='Hadoop 0.20', h=titleHeight, w=blockWidth, x=xpos, y=ypos) # 0.0
replace.title(oH, pnl=panelTitle, title='Hadoop 0.20', h=titleHeight, w=blockWidth, x=xpos, y=ypos)
ypos += titleHeight # increment ypos by title height
for _title in ['Up', 'CPU', 'Nodes']:
replace.single(oH, pnl=panelStat, job='hadoop', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 0.1, 2.1, 4.1
replace.single(oH, pnl=panelStat, job='hadoop', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH # increment xpos by WIDTH
replace.single(oH, pnl=panelStatHadoopUsed, job='hadoop', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos) # 6.1
replace.single(oH, pnl=panelStatHadoopUsed, job='hadoop', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos)
xpos = blockCol # reset xpos to beginning of block
ypos += HEIGHT # increment ypos by HEIGHT
replace.single(oH, pnl=panelStat, job='hadoop', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 0.3
replace.single(oH, pnl=panelStat, job='hadoop', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH # increment xpos by WIDTH
replace.single(oH, pnl=panelStat, job='hadoop', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 2.3
replace.single(oH, pnl=panelStat, job='hadoop', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH # increment xpos by WIDTH
replace.single(oH, pnl=panelStat, job='hadoop', title='Under-rep', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 4.3
replace.single(oH, pnl=panelStat, job='hadoop', title='Under-rep', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)

# gluster
xpos = blockWidth # panel x position for gluster, 8
ypos = blockRow # panel y position for gluster, 0
blockCol = xpos
replace.title(oH, pnl=panelTitle, title='Gluster', h=titleHeight, w=blockWidth, x=xpos, y=ypos) # 8.0
replace.title(oH, pnl=panelTitle, title='Gluster', h=titleHeight, w=blockWidth, x=xpos, y=ypos)
ypos += titleHeight
for _title in ['Up', 'CPU', 'Dsk', 'Mem']:
replace.single(oH, pnl=panelStat, job='gluster', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 8.1, 10.1, 12.1, 14.1
replace.single(oH, pnl=panelStat, job='gluster', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
xpos = blockCol
ypos += HEIGHT
replace.space(oH, pnl=panelSpace, h=HEIGHT, w=blockWidth, x=xpos, y=ypos) # 8.3
replace.space(oH, pnl=panelSpace, h=HEIGHT, w=blockWidth, x=xpos, y=ypos)

## hadoop 3
xpos += blockWidth # panel x position for hadoop 3, blockWidth as previous spacer, 16
ypos = blockRow # panel y position for hadoop 3, 0
blockCol = xpos
replace.title(oH, pnl=panelTitle, title='Hadoop 3', h=titleHeight, w=blockWidth, x=xpos, y=ypos) # 16.0
replace.title(oH, pnl=panelTitle, title='Hadoop 3', h=titleHeight, w=blockWidth, x=xpos, y=ypos)
ypos += titleHeight
for _title in ['Up', 'CPU', 'Dead Nodes']:
replace.single(oH, pnl=panelStat, job='hadoop3', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 16.1, 18.1, 20.1
for _title in ['Up', 'CPU']:
replace.single(oH, pnl=panelStat, job='hadoop3', title=_title, h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStatHadoopUsed, job='hadoop3', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos) # 22.1
replace.single(oH, pnl=panelStat, job='h3stats', title='Dead Nodes', threshold2='0.9', threshold3='1.2', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStatHadoopUsed, job='h3stats', title='Used', threshold2='85', threshold3='96', h=4, w=2, x=xpos, y=ypos)
xpos = blockCol
ypos += HEIGHT
replace.single(oH, pnl=panelStat, job='hadoop3', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 16.3
replace.single(oH, pnl=panelStat, job='hadoop3', title='Dsk', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStat, job='hadoop3', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 18.3
replace.single(oH, pnl=panelStat, job='hadoop3', title='Mem', threshold2='0.1', threshold3='1.1', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)
xpos += WIDTH
replace.single(oH, pnl=panelStat, job='hadoop3', title='Under-rep', h=HEIGHT, w=WIDTH, x=xpos, y=ypos) # 20.3
replace.single(oH, pnl=panelStat, job='h3stats', title='Under-rep', threshold2='9', threshold3='250', h=HEIGHT, w=WIDTH, x=xpos, y=ypos)

# second 'row' of sections -----
blockRow = (titleHeight + HEIGHT + HEIGHT) # blockRow set to exceed above titles and panels, 5