From 2dfa4a33295cc688e10ef0f726fa46b68cf8e11a Mon Sep 17 00:00:00 2001 From: Gil Hoggarth Date: Mon, 14 Nov 2022 17:26:45 +0000 Subject: [PATCH 1/6] Updated dev volume path; shifted to logger --- monitor/dev/start_monitor.sh | 2 +- stat-pusher/README.md | 16 +++++++++++++++- stat-pusher/common/log.py | 14 +++++++++++++- stat-pusher/script/args.py | 11 +++++++---- stat-pusher/script/settings.py | 16 ++++++++++------ stat-pusher/script/stat_values.py | 26 +++++++++++++++----------- stat-pusher/settings | 14 ++++++++++++++ 7 files changed, 75 insertions(+), 24 deletions(-) diff --git a/monitor/dev/start_monitor.sh b/monitor/dev/start_monitor.sh index 1fb3f89..7d7e471 100755 --- a/monitor/dev/start_monitor.sh +++ b/monitor/dev/start_monitor.sh @@ -9,7 +9,7 @@ export VISUALIZER_PORT=8081 export GRAFANA_PORT=3000 export PROMETHEUS_PORT=9090 export ALERTMANAGER_PORT=9093 -export DATA_PREFIX=/mnt/nfs/data +export DATA_PREFIX=/mnt/nfs/data/monitor export DATA_GRAFANA=${DATA_PREFIX}/ukwa-monitor/grafana export DATA_PROMETHEUS=${DATA_PREFIX}/ukwa-monitor/prometheus export DATA_ALERTMANAGER=${DATA_PREFIX}/ukwa-monitor/alertmanager diff --git a/stat-pusher/README.md b/stat-pusher/README.md index ddd2655..0894edd 100755 --- a/stat-pusher/README.md +++ b/stat-pusher/README.md @@ -15,4 +15,18 @@ Then run: $ run_stat_pusher.sh dev ``` -Which should grab the stats and push them to the DEV monitor. It relies on `gitlab/ukwa-monitor` to pick up environment variables. \ No newline at end of file +Which should grab the stats and push them to the DEV monitor. It relies on `gitlab/ukwa-monitor` to pick up environment variables. + + + +LDL VM Monitoring Pusher +======================== + +We've always had difficulties tracking the status of DLS LDL VMs, especially their network connectivity. + +Each LDL VM (except NLW as of 2022 Nov 7) now has a root cronjob that curls "http://ld02:8983/wa/monitor?host=$(hostname -f)" (NLS and +NLW going via 'dls-(bsp|lon)-wb02' instead of directly to 'ld02:8983'). + +(ld02 being 'lduwka-proxy', our WA infrastructure server that routes all LDL requests onwards to our WA services via Apache httpd proxypasses.) + +Consequently, the ldl-pusher.py script runs as a daemon, accepts these curl requests, and pushes them into prometheus. diff --git a/stat-pusher/common/log.py b/stat-pusher/common/log.py index 759e1c5..72af8c0 100644 --- a/stat-pusher/common/log.py +++ b/stat-pusher/common/log.py @@ -5,4 +5,16 @@ import logging def configure(lvl='INFO'): - logging.basicConfig(format='[%(asctime)s %(levelname)s] %(message)s', level=lvl) + logging.basicConfig(format='[%(asctime)s %(funcName)s %(levelname)s] %(message)s', level=lvl) + +def configure_file(eset): + scriptHandler = logging.FileHandler(eset['logfpfn']) + formatter = logging.Formatter("[%(asctime)s %(funcName)s %(levelname)s] %(message)s") + scriptHandler.setFormatter(formatter) + logging.root.addHandler(scriptHandler) + logging.root.setLevel(logging.WARNING) + if eset['loglevel'] == 'ERROR': logging.getLogger().setLevel(logging.ERROR) + elif eset['loglevel'] == 'WARNING': logging.getLogger().setLevel(logging.WARNING) + elif eset['loglevel'] == 'DEBUG': logging.getLogger().setLevel(logging.DEBUG) + else: logging.getLogger().setLevel(logging.INFO) + logger = logging.getLogger('__main__') diff --git a/stat-pusher/script/args.py b/stat-pusher/script/args.py index 7857de0..a75f6a6 100644 --- a/stat-pusher/script/args.py +++ b/stat-pusher/script/args.py @@ -7,22 +7,25 @@ import sys import logging +logger = logging.getLogger(__name__) + + def passed(): environ = '' if len(sys.argv) == 2: environ = sys.argv[1] - logging.debug(f"Script argument [{environ}]") + logger.debug(f"Script argument [{environ}]") # test environ value if environ == 'dev' or environ == 'beta' or environ == 'prod': pass else: - logging.error(f"Script environ argument not recognised [{environ}]") + logger.error(f"Script environ argument not recognised [{environ}]") sys.exit() else: - logging.error("Script environ argument not identified") - logging.error(f"sys.argv [{sys.argv}]") + logger.error("Script environ argument not identified") + logger.error(f"sys.argv [{sys.argv}]") sys.exit() return environ diff --git a/stat-pusher/script/settings.py b/stat-pusher/script/settings.py index 496d963..371673a 100644 --- a/stat-pusher/script/settings.py +++ b/stat-pusher/script/settings.py @@ -6,6 +6,10 @@ stgFile = 'settings' environ = '' +logger = logging.getLogger(__name__) + + + # functions ------------------------ def read(env='dev'): global stgFile @@ -14,26 +18,26 @@ def read(env='dev'): # test settings file exists if os.path.isfile(stgFile): - logging.debug(f"Reading [{env}] settings") + logger.debug(f"Reading [{env}] settings") # read environ settings file cfg.read(stgFile) if env in cfg.sections(): environ = cfg[env] else: - logging.error(f"[{env}] section missing from [{stgFile}] settings file") + logger.error(f"[{env}] section missing from [{stgFile}] settings file") sys.exit() else: - logging.error(f"[{stgFile}] settings file missing") + logger.error(f"[{stgFile}] settings file missing") sys.exit() - logging.info(f"Using {env} environment settings") + logger.info(f"Using {env} environment settings") def get(key): global environ if key in environ: - logging.debug(f"setting {key}: [{environ[key]}]") + logger.debug(f"setting {key}: [{environ[key]}]") return environ[key] else: - logging.error(f"No cfg key [{key}] declared") + logger.error(f"No cfg key [{key}] declared") sys.exit() diff --git a/stat-pusher/script/stat_values.py b/stat-pusher/script/stat_values.py index 935b798..4ae7b76 100644 --- a/stat-pusher/script/stat_values.py +++ b/stat-pusher/script/stat_values.py @@ -12,24 +12,28 @@ import dateutil.parser import datetime +logger = logging.getLogger(__name__) + + + def get_json_value(uri, match): - logging.debug(f"uri [{uri}]") + logger.debug(f"uri [{uri}]") # convert match string into list, to traverse uri json response matchList = ast.literal_eval(match) - logging.debug(f"matchList [{matchList}] type [{type(matchList)}]") + logger.debug(f"matchList [{matchList}] type [{type(matchList)}]") # get response try: r = requests.get(uri) - logging.debug(f"Response code [{r.status_code}]") + logger.debug(f"Response code [{r.status_code}]") r.raise_for_status() response = r.json() except HTTPError as he: - logging.error(f"HTTP error trying to get [{uri}]\n[{he}]") + logger.error(f"HTTP error trying to get [{uri}]\n[{he}]") sys.exit() except Exception as e: - logging.error(f"Failed to get [{uri}]\n[{e}]") + logger.error(f"Failed to get [{uri}]\n[{e}]") sys.exit() # extract value @@ -40,9 +44,9 @@ def get_json_value(uri, match): elif k in value[0]: value = value[0][k] else: - logging.error(f"match key [{k}] not found in uri {uri}\njson [{value}]") + logger.error(f"match key [{k}] not found in uri {uri}\njson [{value}]") sys.exit() - logging.debug(f"Value [{value}] type [{type(value)}]") + logger.debug(f"Value [{value}] type [{type(value)}]") # ensure numerical value if type(value) is not int and type(value) is not float: @@ -51,14 +55,14 @@ def get_json_value(uri, match): try: dt = dateutil.parser.parse(value) except Exception as e: - logging.error(f"Value [{value}] type [{type(value)}] not recognised as datestamp") + logger.error(f"Value [{value}] type [{type(value)}] not recognised as datestamp") sys.exit() if isinstance(dt, datetime.datetime): - logging.debug(f"timestamp dt [{dt}] type [{type(dt)}]") + logger.debug(f"timestamp dt [{dt}] type [{type(dt)}]") value = dt.timestamp() - logging.debug(f"Value epoch [{value}]") + logger.debug(f"Value epoch [{value}]") else: - logging.error(f"Value [{value}] type [{type(value)}] not convertible to numeric") + logger.error(f"Value [{value}] type [{type(value)}] not convertible to numeric") sys.exit() return value diff --git a/stat-pusher/settings b/stat-pusher/settings index 53ee2f4..e4ed99c 100644 --- a/stat-pusher/settings +++ b/stat-pusher/settings @@ -2,10 +2,24 @@ pushgtw = http://monitor-pushgateway.dapi.wa.bl.uk statsfile = dev.stats +# ldl server connection testing +logfpfn = /home/monitor/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.log +loglevel = DEBUG +uid = 1000 +gid = 1000 +hostname = 0.0.0.0 +port = 9119 +# prometheus settings +job = ldl_rr_connections +metric = up +desc = Curl requests from LDL VMs, indicating LDL to WA connectivity + + [beta] pushgtw = http://monitor-pushgateway.bapi.wa.bl.uk statsfile = beta.stats + [prod] pushgtw = http://monitor-pushgateway.api.wa.bl.uk statsfile = prod.stats From 534b724f02d17e4df769350950aafe3440aeef98 Mon Sep 17 00:00:00 2001 From: Gil Hoggarth Date: Tue, 15 Nov 2022 14:07:17 +0000 Subject: [PATCH 2/6] ldl-pusher wip --- stat-pusher/ldl-pusher.py | 130 +++++++++++++++++++++++++++++++ stat-pusher/ldl-requirements.txt | 3 + stat-pusher/run_ldl_pusher.sh | 17 ++++ 3 files changed, 150 insertions(+) create mode 100755 stat-pusher/ldl-pusher.py create mode 100644 stat-pusher/ldl-requirements.txt create mode 100755 stat-pusher/run_ldl_pusher.sh diff --git a/stat-pusher/ldl-pusher.py b/stat-pusher/ldl-pusher.py new file mode 100755 index 0000000..2d80594 --- /dev/null +++ b/stat-pusher/ldl-pusher.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +''' +Pushes LDL monitoring curls into prometheus +''' + +import os, sys, logging +import socket, re +import configparser +import daemon, lockfile +from http.server import BaseHTTPRequestHandler, HTTPServer +from prometheus_client import CollectorRegistry, Gauge, push_to_gateway + +from common import log + +# globals +PIDFILE = f"{__file__}.pid" +LOCKFILE = f"{PIDFILE}.lock" +SETTINGSFILE = 'settings' +REQUEST = re.compile("^\w+\s+(/.+)\s+HTTP/\d.\d$") +LDLHOST = re.compile("^/wa/monitor\?host=(.+)$") + +logger = logging.getLogger(__name__) +eset = '' + + +# classes and functions ----------------------- +def _read_settings(environ): + cfg = configparser.ConfigParser() + if os.path.isfile(SETTINGSFILE): + cfg.read(SETTINGSFILE) + if environ in cfg.sections(): + return cfg[environ] + else: + print(f"Section [{environ}] missing from [{SETTINGSFILE}] settings file") + sys.exit(1) + else: + print(f"Settings file [{SETTINGSFILE}] missing") + sys.exit(1) + +class webServer(BaseHTTPRequestHandler): + def _set_headers(self): + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + + def do_HEAD(self): + self._set_headers() + + def do_GET(self): + self._set_headers() + + # grab request + try: + reqMatch = REQUEST.match(self.requestline) + request = reqMatch.group(1) + except Exception as e: + logger.warning(f"Failed to match request in [{self.requestline}]") + + # process request + _process_request(request) + +def _process_request(request): + global eset + logger.debug(f"request: [{request}]") + + # get hostname, skip further processing if fail + ldlHostMatch = LDLHOST.match(request) + if ldlHostMatch: + ldlHost = ldlHostMatch.group(1) + else: + logger.warning(f"Failed to get hostname from [{request}]") + return + logger.debug(f"ldlHost: [{ldlHost}]") + + # set pushgateway values + registry = CollectorRegistry() + g = Gauge(eset['metric'], eset['desc'], labelnames=['instance'], registry=registry) + g.labels(instance=ldlHost).set(1) + + # push to prometheus service + logger.debug(f"pushgtw: [{eset['pushgtw']}]") + logger.info(f"Pushing to gateway [job={eset['job']}, metric={eset['metric']}, request={request}, instance={ldlHost}]") + push_to_gateway(eset['pushgtw'], registry=registry, job=eset['job']) + + + +# script -------------------------------------- +def script(eset): + log.configure_file(eset) + + # create web service + monitorServer = HTTPServer((eset['hostname'], int(eset['port'])), webServer) + logger.info(f"Started LDL monitoring web server [{eset['hostname']}:{eset['port']}]") + try: + monitorServer.serve_forever() + except Exception as e: + logger.warning(f"LDL monitoring web server exiting") + logger.warning(f"Message: [{e}]") + + # close and end + monitorServer.server_close() + logger.info(f"Fin ----\n") + +# main ---------------------------------------- +if __name__ == '__main__': + # check for lockfile + if os.path.exists(LOCKFILE): + print(f"Exiting as [{LOCKFILE}] exists") + sys.exit(1) + + # get swarm environment + senvMatch = re.match('^(dev|beta|prod)', socket.gethostname()) + if senvMatch: + environ = senvMatch.group(1) + else: + print(f"Swarm environment not identified from [{socket.gethostname()}]") + sys.exit(1) + + # read environment settings + eset = _read_settings(environ) + + # run daemon + with daemon.DaemonContext( + stdout = sys.stdout, + stderr = sys.stderr, + uid = int(eset['uid']), + gid = int(eset['gid']), + pidfile = lockfile.FileLock(PIDFILE) + ): + script(eset) diff --git a/stat-pusher/ldl-requirements.txt b/stat-pusher/ldl-requirements.txt new file mode 100644 index 0000000..9f8e2bf --- /dev/null +++ b/stat-pusher/ldl-requirements.txt @@ -0,0 +1,3 @@ +python-daemon +lockfile +prometheus_client diff --git a/stat-pusher/run_ldl_pusher.sh b/stat-pusher/run_ldl_pusher.sh new file mode 100755 index 0000000..5cad055 --- /dev/null +++ b/stat-pusher/run_ldl_pusher.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#### Swarm environment determined in script, from hostname + +# setup venv +export PYTHONPATH=~/github/ukwa-monitor/stat-pusher +source $PYTHONPATH/venv/bin/activate +cd $PYTHONPATH + +# ensure log directory exists +[[ -d logs/ ]] || mkdir logs + +# ensure python libraries installed +pip install -r ldl-requirements.txt + +# run stat-pusher script +python ldl-pusher.py From 0f564083decc469f7621d9ee30c5f30bd9b52a3e Mon Sep 17 00:00:00 2001 From: Gil Hoggarth Date: Wed, 16 Nov 2022 16:55:03 +0000 Subject: [PATCH 3/6] ldl connection pusher --- .gitignore | 6 ++ monitor/dev/start_monitor.sh | 2 +- .../dashboards/daily_dashboard.json | 8 +-- .../provisioning/dashboards/wa_status.json | 12 ++-- stat-pusher/ldl-pusher.py | 69 ++++++++++++++----- stat-pusher/ldl-requirements.txt | 1 + stat-pusher/run_ldl_pusher.sh | 2 +- stat-pusher/settings | 13 ++-- 8 files changed, 78 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index 2678db8..f6b5c6a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,12 @@ ENV/ *.pyc *out +*log +*pid +*lock +stat-pusher/dev[0-9]-*-[0-9]* +stat-pusher/beta[0-9]-*-[0-9]* +stat-pusher/prod[0-9]-*-[0-9]* */alertmanager/config.yml */grafana/grafana.ini diff --git a/monitor/dev/start_monitor.sh b/monitor/dev/start_monitor.sh index 7d7e471..b28e4a8 100755 --- a/monitor/dev/start_monitor.sh +++ b/monitor/dev/start_monitor.sh @@ -9,7 +9,7 @@ export VISUALIZER_PORT=8081 export GRAFANA_PORT=3000 export PROMETHEUS_PORT=9090 export ALERTMANAGER_PORT=9093 -export DATA_PREFIX=/mnt/nfs/data/monitor +export DATA_PREFIX=/mnt/nfs/data/gilh export DATA_GRAFANA=${DATA_PREFIX}/ukwa-monitor/grafana export DATA_PROMETHEUS=${DATA_PREFIX}/ukwa-monitor/prometheus export DATA_ALERTMANAGER=${DATA_PREFIX}/ukwa-monitor/alertmanager diff --git a/monitor/grafana/provisioning/dashboards/daily_dashboard.json b/monitor/grafana/provisioning/dashboards/daily_dashboard.json index 328829f..3d1322c 100644 --- a/monitor/grafana/provisioning/dashboards/daily_dashboard.json +++ b/monitor/grafana/provisioning/dashboards/daily_dashboard.json @@ -436,7 +436,7 @@ "tableColumn": "", "targets": [ { - "expr": "hdfs_under_replicated_block_count{instance='hdfs-exporter.bapi.wa.bl.uk:80'}", + "expr": "hdfs_under_replicated_block_count{instance='hdfs-exporter.dapi.wa.bl.uk:80'}", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -521,7 +521,7 @@ "tableColumn": "", "targets": [ { - "expr": "hdfs_used_percent{instance='hdfs-exporter.bapi.wa.bl.uk:80'}", + "expr": "hdfs_used_percent{instance='hdfs-exporter.dapi.wa.bl.uk:80'}", "format": "time_series", "intervalFactor": 1, "legendFormat": "", @@ -602,7 +602,7 @@ "tableColumn": "", "targets": [ { - "expr": "hdfs_node_count{status='dead',instance='hdfs-exporter.bapi.wa.bl.uk:80'}", + "expr": "hdfs_node_count{status='dead',instance='hdfs-exporter.dapi.wa.bl.uk:80'}", "format": "time_series", "instant": true, "intervalFactor": 1, @@ -683,7 +683,7 @@ "tableColumn": "", "targets": [ { - "expr": "hdfs_node_count{status='live',instance='hdfs-exporter.bapi.wa.bl.uk:80'}", + "expr": "hdfs_node_count{status='live',instance='hdfs-exporter.dapi.wa.bl.uk:80'}", "format": "time_series", "instant": true, "intervalFactor": 1, diff --git a/monitor/grafana/provisioning/dashboards/wa_status.json b/monitor/grafana/provisioning/dashboards/wa_status.json index 53b282a..34168c1 100644 --- a/monitor/grafana/provisioning/dashboards/wa_status.json +++ b/monitor/grafana/provisioning/dashboards/wa_status.json @@ -266,7 +266,7 @@ "pluginVersion": "7.1.3", "targets": [ { - "expr": "hdfs_node_count{status=\"dead\",instance=\"hdfs-exporter.bapi.wa.bl.uk:80\"}", + "expr": "hdfs_node_count{status=\"dead\",instance=\"hdfs-exporter.dapi.wa.bl.uk:80\"}", "legendFormat": "", "interval": "", "format": "time_series", @@ -293,7 +293,7 @@ "maxDataPoints": 100, "targets": [ { - "expr": "hdfs_used_percent{instance=\"hdfs-exporter.bapi.wa.bl.uk:80\"}", + "expr": "hdfs_used_percent{instance=\"hdfs-exporter.dapi.wa.bl.uk:80\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" @@ -567,7 +567,7 @@ "pluginVersion": "7.1.3", "targets": [ { - "expr": "hdfs_under_replicated_block_count{instance=\"hdfs-exporter.bapi.wa.bl.uk:80\"}", + "expr": "hdfs_under_replicated_block_count{instance=\"hdfs-exporter.dapi.wa.bl.uk:80\"}", "legendFormat": "", "interval": "", "format": "time_series", @@ -1127,7 +1127,7 @@ "pluginVersion": "7.1.3", "targets": [ { - "expr": "hadoop_hdfs_namenode_nninfo_dead_nodes_count{instance=\"h3exporter.bapi.wa.bl.uk:80\"}", + "expr": "hadoop_hdfs_namenode_nninfo_dead_nodes_count{instance=\"h3exporter.dapi.wa.bl.uk:80\"}", "legendFormat": "", "interval": "", "format": "time_series", @@ -1154,7 +1154,7 @@ "maxDataPoints": 100, "targets": [ { - "expr": "round(100 - hadoop_hdfs_namenode_nninfo_percent_remaining{instance=\"h3exporter.bapi.wa.bl.uk:80\"})", + "expr": "round(100 - hadoop_hdfs_namenode_nninfo_percent_remaining{instance=\"h3exporter.dapi.wa.bl.uk:80\"})", "format": "time_series", "intervalFactor": 1, "refId": "A" @@ -1428,7 +1428,7 @@ "pluginVersion": "7.1.3", "targets": [ { - "expr": "hadoop_hdfs_namenode_fsname_system_under_replicated_blocks{instance=\"h3exporter.bapi.wa.bl.uk:80\"}", + "expr": "hadoop_hdfs_namenode_fsname_system_under_replicated_blocks{instance=\"h3exporter.dapi.wa.bl.uk:80\"}", "legendFormat": "", "interval": "", "format": "time_series", diff --git a/stat-pusher/ldl-pusher.py b/stat-pusher/ldl-pusher.py index 2d80594..0fbdcef 100755 --- a/stat-pusher/ldl-pusher.py +++ b/stat-pusher/ldl-pusher.py @@ -8,19 +8,29 @@ import configparser import daemon, lockfile from http.server import BaseHTTPRequestHandler, HTTPServer +import datetime from prometheus_client import CollectorRegistry, Gauge, push_to_gateway from common import log # globals +logger = logging.getLogger(__name__) + PIDFILE = f"{__file__}.pid" LOCKFILE = f"{PIDFILE}.lock" SETTINGSFILE = 'settings' REQUEST = re.compile("^\w+\s+(/.+)\s+HTTP/\d.\d$") LDLHOST = re.compile("^/wa/monitor\?host=(.+)$") +YMDHM = '%Y%m%d%H%M' +SCHEDULE = 15 +INSTANCE = 'ldl_connection_count' -logger = logging.getLogger(__name__) +# environ settings eset = '' +# dldl - dictionary of latest LDL connections +dldl = {'DLS-BSP-WB01':0, 'DLS-BSP-WB02':0, 'DLS-BSP-WB03':0, 'DLS-BSP-WB04':0, 'DLS-LON-WB01':0, 'DLS-LON-WB02':0, 'DLS-LON-WB03':0, 'DLS-LON-WB04':0, 'DLS-NLS-WB01':0, 'DLS-NLW-WB01':0} +# last YYYYMMDDHHMM push to gateway happened +pushymdhm = 0 # classes and functions ----------------------- @@ -60,8 +70,13 @@ def do_GET(self): _process_request(request) def _process_request(request): + global YMDHM + global SCHEDULE + global INSTANCE global eset - logger.debug(f"request: [{request}]") + global dldl + global pushymdhm + logger.debug(f"Received request: {request}") # get hostname, skip further processing if fail ldlHostMatch = LDLHOST.match(request) @@ -70,19 +85,39 @@ def _process_request(request): else: logger.warning(f"Failed to get hostname from [{request}]") return - logger.debug(f"ldlHost: [{ldlHost}]") - - # set pushgateway values - registry = CollectorRegistry() - g = Gauge(eset['metric'], eset['desc'], labelnames=['instance'], registry=registry) - g.labels(instance=ldlHost).set(1) - - # push to prometheus service - logger.debug(f"pushgtw: [{eset['pushgtw']}]") - logger.info(f"Pushing to gateway [job={eset['job']}, metric={eset['metric']}, request={request}, instance={ldlHost}]") - push_to_gateway(eset['pushgtw'], registry=registry, job=eset['job']) - + # get current time + nowymdhm = int((datetime.datetime.now()).strftime(YMDHM)) + + # update LDL in dldl + dldl[ldlHost] = nowymdhm + + # on schedule, report LDL connection status to pushgateway + logger.debug(f"dldl {dldl}") + if (nowymdhm - pushymdhm) > SCHEDULE: + # count LDLs responded in last SCHEDULE period + up = 0 + for _ldl in dldl: + if (nowymdhm - dldl[_ldl]) < SCHEDULE: up += 1 + else: logger.debug(f"LDL [{_ldl}] hasn't curled in {SCHEDULE} minutes") + + # set pushgateway values and push to prometheus service + registry = CollectorRegistry() + g = Gauge(eset['metric'], eset['desc'], labelnames=['instance'], registry=registry) + g.labels(instance=INSTANCE).set(up) + push_to_gateway(eset['pushgtw'], registry=registry, job=eset['job']) + + # write latest push to output file (done via output rather than log so log doesn't + # become huge over time) + with open(eset['output'], 'w') as out: + out.write(f"Output datestamp:\t{nowymdhm}\n") + out.write(f"Pushing to gateway:\tjob={eset['job']}, instance={INSTANCE}, recent_connections={up}\n") + for _ldl in dldl: out.write(f"\t{_ldl}:\t{dldl[_ldl]}\n") + out.write("\n") + out.close() + + # store push time + pushymdhm = nowymdhm # script -------------------------------------- def script(eset): @@ -90,7 +125,7 @@ def script(eset): # create web service monitorServer = HTTPServer((eset['hostname'], int(eset['port'])), webServer) - logger.info(f"Started LDL monitoring web server [{eset['hostname']}:{eset['port']}]") + logger.info(f"Started LDL monitoring web server: {eset['hostname']}:{eset['port']}") try: monitorServer.serve_forever() except Exception as e: @@ -99,13 +134,13 @@ def script(eset): # close and end monitorServer.server_close() - logger.info(f"Fin ----\n") + logger.warning(f"//////////////////// RUNNING AS DAEMON - SHOULD NEVER FINISH /////////////////////\n") # main ---------------------------------------- if __name__ == '__main__': # check for lockfile if os.path.exists(LOCKFILE): - print(f"Exiting as [{LOCKFILE}] exists") + print(f"Exiting as [{LOCKFILE}] exists, service already be running") sys.exit(1) # get swarm environment diff --git a/stat-pusher/ldl-requirements.txt b/stat-pusher/ldl-requirements.txt index 9f8e2bf..4dd84ad 100644 --- a/stat-pusher/ldl-requirements.txt +++ b/stat-pusher/ldl-requirements.txt @@ -1,3 +1,4 @@ python-daemon lockfile +datetime prometheus_client diff --git a/stat-pusher/run_ldl_pusher.sh b/stat-pusher/run_ldl_pusher.sh index 5cad055..556a70f 100755 --- a/stat-pusher/run_ldl_pusher.sh +++ b/stat-pusher/run_ldl_pusher.sh @@ -14,4 +14,4 @@ cd $PYTHONPATH pip install -r ldl-requirements.txt # run stat-pusher script -python ldl-pusher.py +nohup python ldl-pusher.py & diff --git a/stat-pusher/settings b/stat-pusher/settings index e4ed99c..56fd5ce 100644 --- a/stat-pusher/settings +++ b/stat-pusher/settings @@ -3,15 +3,16 @@ pushgtw = http://monitor-pushgateway.dapi.wa.bl.uk statsfile = dev.stats # ldl server connection testing -logfpfn = /home/monitor/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.log -loglevel = DEBUG -uid = 1000 -gid = 1000 +logfpfn = /home/gilh/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.log +output = /home/gilh/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.out +loglevel = INFO +uid = 1004 +gid = 1004 hostname = 0.0.0.0 port = 9119 # prometheus settings -job = ldl_rr_connections -metric = up +job = ldl_rr +metric = recent_connections desc = Curl requests from LDL VMs, indicating LDL to WA connectivity From 8ba8587f8a67b76714c81b5bd425603c5f892245 Mon Sep 17 00:00:00 2001 From: Gil Hoggarth Date: Thu, 17 Nov 2022 12:12:55 +0000 Subject: [PATCH 4/6] Updated status dashboard with ldl count --- .../generate_wa-status_dashboard.py | 4 + .../dashboards/wa_status.json-template | 132 ++++++++++++++---- 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py b/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py index 67a09e3..ee6731e 100755 --- a/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py +++ b/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py @@ -147,6 +147,9 @@ def replace_output_single(outHandle, **kwargs): elif kwargs['title'] == 'Used': expr = 'round(100 - hadoop_hdfs_namenode_nninfo_percent_remaining{instance=\\"${HDFS3_EXPORTER}\\"})' templateCode = templateCode.replace('', expr) + elif kwargs['title'] == 'LDLs': + expr = 'recent_connections{instance=\\"ldl_connection_count\\", job=\\"ldl_rr\\"}' + templateCode = templateCode.replace('', expr) # add last comma if not last panel if 'lastPanel' not in kwargs: @@ -207,6 +210,7 @@ def main(): replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='CPU', h=2, w=2, x=2, y=9) replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='Dsk', h=2, w=2, x=4, y=9) replace_output_single(outHandle, pnl=panelStat, job = 'infrastructure', title = 'Mem', h=2, w=2, x=6, y=9) + replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='LDLs', h=2, w=2, x=0, y=11, textmode='value', colour1='#D44A3A', colour3='#299C46', threshold2='9', threshold3='10') # general services replace_output_title(outHandle, pnl=panelTitle, job='services', title='Services', h=1, w=8, x=8, y=8) replace_output_single(outHandle, pnl=panelStat, job='services', title='Up', h=2, w=2, x=8, y=9) diff --git a/monitor/grafana/provisioning/dashboards/wa_status.json-template b/monitor/grafana/provisioning/dashboards/wa_status.json-template index de48008..114ffc1 100644 --- a/monitor/grafana/provisioning/dashboards/wa_status.json-template +++ b/monitor/grafana/provisioning/dashboards/wa_status.json-template @@ -1782,6 +1782,82 @@ "interval": null, "datasource": null }, + { + "fieldConfig": { + "defaults": { + "custom": {}, + "unit": "none", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": null, + "color": "#D44A3A" + }, + { + "value": 10, + "color": "#ED8027" + }, + { + "value": 11, + "color": "#299C46" + } + ] + }, + "mappings": [ + { + "op": "=", + "text": "N/A", + "value": "null", + "id": 0, + "type": 1 + } + ], + "nullValueMode": "connected" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 0, + "y": 11 + }, + "id": 29, + "links": [], + "maxDataPoints": 100, + "options": { + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "orientation": "horizontal", + "textMode": "value", + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto" + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "recent_connections{instance=\"ldl_connection_count\", job=\"ldl_rr\"}", + "legendFormat": "", + "interval": "", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "type": "stat", + "title": "LDLs", + "cacheTimeout": null, + "interval": null, + "datasource": null + }, { "content": "", "gridPos": { @@ -1790,7 +1866,7 @@ "x": 8, "y": 8 }, - "id": 29, + "id": 30, "links": [], "mode": "markdown", "title": "Services", @@ -1837,7 +1913,7 @@ "x": 8, "y": 9 }, - "id": 30, + "id": 31, "links": [], "maxDataPoints": 100, "options": { @@ -1913,7 +1989,7 @@ "x": 10, "y": 9 }, - "id": 31, + "id": 32, "links": [], "maxDataPoints": 100, "options": { @@ -1989,7 +2065,7 @@ "x": 12, "y": 9 }, - "id": 32, + "id": 33, "links": [], "maxDataPoints": 100, "options": { @@ -2065,7 +2141,7 @@ "x": 14, "y": 9 }, - "id": 33, + "id": 34, "links": [], "maxDataPoints": 100, "options": { @@ -2108,7 +2184,7 @@ "x": 16, "y": 8 }, - "id": 34, + "id": 35, "links": [], "mode": "markdown", "title": "Solr", @@ -2155,7 +2231,7 @@ "x": 16, "y": 9 }, - "id": 35, + "id": 36, "links": [], "maxDataPoints": 100, "options": { @@ -2231,7 +2307,7 @@ "x": 18, "y": 9 }, - "id": 36, + "id": 37, "links": [], "maxDataPoints": 100, "options": { @@ -2307,7 +2383,7 @@ "x": 20, "y": 9 }, - "id": 37, + "id": 38, "links": [], "maxDataPoints": 100, "options": { @@ -2383,7 +2459,7 @@ "x": 22, "y": 9 }, - "id": 38, + "id": 39, "links": [], "maxDataPoints": 100, "options": { @@ -2459,7 +2535,7 @@ "x": 16, "y": 11 }, - "id": 39, + "id": 40, "links": [], "maxDataPoints": 100, "options": { @@ -2502,7 +2578,7 @@ "x": 0, "y": 12 }, - "id": 40, + "id": 41, "options": { "mode": "markdown", "content": "# WA Services" @@ -2523,7 +2599,7 @@ "x": 0, "y": 14 }, - "id": 41, + "id": 42, "links": [], "mode": "markdown", "title": "Ingest & Metadata", @@ -2570,7 +2646,7 @@ "x": 0, "y": 15 }, - "id": 42, + "id": 43, "links": [], "maxDataPoints": 100, "options": { @@ -2646,7 +2722,7 @@ "x": 2, "y": 15 }, - "id": 43, + "id": 44, "links": [], "maxDataPoints": 100, "options": { @@ -2722,7 +2798,7 @@ "x": 4, "y": 15 }, - "id": 44, + "id": 45, "links": [], "maxDataPoints": 100, "options": { @@ -2798,7 +2874,7 @@ "x": 6, "y": 15 }, - "id": 45, + "id": 46, "links": [], "maxDataPoints": 100, "options": { @@ -2874,7 +2950,7 @@ "x": 0, "y": 17 }, - "id": 46, + "id": 47, "links": [], "maxDataPoints": 100, "options": { @@ -2917,7 +2993,7 @@ "x": 8, "y": 14 }, - "id": 47, + "id": 48, "links": [], "mode": "markdown", "title": "TrackDB", @@ -2964,7 +3040,7 @@ "x": 8, "y": 15 }, - "id": 48, + "id": 49, "links": [], "maxDataPoints": 100, "options": { @@ -3040,7 +3116,7 @@ "x": 12, "y": 15 }, - "id": 49, + "id": 50, "links": [], "maxDataPoints": 100, "options": { @@ -3083,7 +3159,7 @@ "x": 16, "y": 14 }, - "id": 50, + "id": 51, "links": [], "mode": "markdown", "title": "Discovery & Access", @@ -3130,7 +3206,7 @@ "x": 16, "y": 15 }, - "id": 51, + "id": 52, "links": [], "maxDataPoints": 100, "options": { @@ -3206,7 +3282,7 @@ "x": 18, "y": 15 }, - "id": 52, + "id": 53, "links": [], "maxDataPoints": 100, "options": { @@ -3282,7 +3358,7 @@ "x": 20, "y": 15 }, - "id": 53, + "id": 54, "links": [], "maxDataPoints": 100, "options": { @@ -3358,7 +3434,7 @@ "x": 22, "y": 15 }, - "id": 54, + "id": 55, "links": [], "maxDataPoints": 100, "options": { @@ -3434,7 +3510,7 @@ "x": 16, "y": 17 }, - "id": 55, + "id": 56, "links": [], "maxDataPoints": 100, "options": { @@ -3510,7 +3586,7 @@ "x": 18, "y": 17 }, - "id": 56, + "id": 57, "links": [], "maxDataPoints": 100, "options": { From e98ed72c3c7b15ad3b91de0a97a1a2224b68e148 Mon Sep 17 00:00:00 2001 From: Gil Hoggarth Date: Thu, 17 Nov 2022 12:52:08 +0000 Subject: [PATCH 5/6] Correction to ldl thresholds --- .../generate_wa-status_dashboard.py | 2 +- .../provisioning/dashboards/wa_status.json | 3589 ----------------- .../dashboards/wa_status.json-template | 4 +- 3 files changed, 3 insertions(+), 3592 deletions(-) delete mode 100644 monitor/grafana/provisioning/dashboards/wa_status.json diff --git a/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py b/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py index ee6731e..dd90da9 100755 --- a/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py +++ b/monitor/grafana/provisioning/_dashboard_generator/generate_wa-status_dashboard.py @@ -210,7 +210,7 @@ def main(): replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='CPU', h=2, w=2, x=2, y=9) replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='Dsk', h=2, w=2, x=4, y=9) replace_output_single(outHandle, pnl=panelStat, job = 'infrastructure', title = 'Mem', h=2, w=2, x=6, y=9) - replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='LDLs', h=2, w=2, x=0, y=11, textmode='value', colour1='#D44A3A', colour3='#299C46', threshold2='9', threshold3='10') + replace_output_single(outHandle, pnl=panelStat, job='infrastructure', title='LDLs', h=2, w=2, x=0, y=11, textmode='value', colour1='#D44A3A', colour3='#299C46', threshold2='8.5', threshold3='9.5') # general services replace_output_title(outHandle, pnl=panelTitle, job='services', title='Services', h=1, w=8, x=8, y=8) replace_output_single(outHandle, pnl=panelStat, job='services', title='Up', h=2, w=2, x=8, y=9) diff --git a/monitor/grafana/provisioning/dashboards/wa_status.json b/monitor/grafana/provisioning/dashboards/wa_status.json deleted file mode 100644 index 34168c1..0000000 --- a/monitor/grafana/provisioning/dashboards/wa_status.json +++ /dev/null @@ -1,3589 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": 5, - "links": [], - "panels": [ - { - "type": "text", - "gridPos": { - "h": 2, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "mode": "markdown", - "content": "# Storage" - }, - "transparent": true, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "datasource": null, - "pluginVersion": "7.5.7" - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 0, - "y": 2 - }, - "id": 2, - "links": [], - "mode": "markdown", - "title": "Hadoop 0.20", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 0, - "y": 3 - }, - "id": 3, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"hadoop\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 2, - "y": 3 - }, - "id": 4, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"hadoop\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 4, - "y": 3 - }, - "id": 5, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "hdfs_node_count{status=\"dead\",instance=\"hdfs-exporter.dapi.wa.bl.uk:80\"}", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Nodes", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "gridPos": { - "h": 4, - "w": 2, - "x": 6, - "y": 3 - }, - "id": 6, - "links": [], - "maxDataPoints": 100, - "targets": [ - { - "expr": "hdfs_used_percent{instance=\"hdfs-exporter.dapi.wa.bl.uk:80\"}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "title": "Used", - "type": "stat", - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "auto", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "percent", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 85, - "color": "#ED8027" - }, - { - "value": 90, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "pluginVersion": "7.1.3", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 0, - "y": 5 - }, - "id": 7, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"hadoop\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"hadoop\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1.1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 2, - "y": 5 - }, - "id": 8, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"hadoop\"} + node_memory_Buffers_bytes{job=\"hadoop\"} + node_memory_Cached_bytes{job=\"hadoop\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"hadoop\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 4, - "y": 5 - }, - "id": 9, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "hdfs_under_replicated_block_count{instance=\"hdfs-exporter.dapi.wa.bl.uk:80\"}", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Under-rep", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 8, - "y": 2 - }, - "id": 10, - "links": [], - "mode": "markdown", - "title": "Gluster", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 8, - "y": 3 - }, - "id": 11, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"gluster\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 10, - "y": 3 - }, - "id": 12, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"gluster\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 12, - "y": 3 - }, - "id": 13, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"gluster\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"gluster\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 14, - "y": 3 - }, - "id": 14, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"gluster\"} + node_memory_Buffers_bytes{job=\"gluster\"} + node_memory_Cached_bytes{job=\"gluster\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"gluster\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 16, - "y": 2 - }, - "id": 15, - "links": [], - "mode": "markdown", - "title": "Hadoop 3", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 16, - "y": 3 - }, - "id": 16, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"hadoop3\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 3 - }, - "id": 17, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"hadoop3\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 20, - "y": 3 - }, - "id": 18, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "hadoop_hdfs_namenode_nninfo_dead_nodes_count{instance=\"h3exporter.dapi.wa.bl.uk:80\"}", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dead Nodes", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "gridPos": { - "h": 4, - "w": 2, - "x": 22, - "y": 3 - }, - "id": 19, - "links": [], - "maxDataPoints": 100, - "targets": [ - { - "expr": "round(100 - hadoop_hdfs_namenode_nninfo_percent_remaining{instance=\"h3exporter.dapi.wa.bl.uk:80\"})", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "title": "Used", - "type": "stat", - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "auto", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "percent", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 85, - "color": "#ED8027" - }, - { - "value": 90, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "pluginVersion": "7.1.3", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 16, - "y": 5 - }, - "id": 20, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"hadoop3\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"hadoop3\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1.1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 5 - }, - "id": 21, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"hadoop3\"} + node_memory_Buffers_bytes{job=\"hadoop3\"} + node_memory_Cached_bytes{job=\"hadoop3\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"hadoop3\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 20, - "y": 5 - }, - "id": 22, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "value", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "hadoop_hdfs_namenode_fsname_system_under_replicated_blocks{instance=\"h3exporter.dapi.wa.bl.uk:80\"}", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Under-rep", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "type": "text", - "gridPos": { - "h": 2, - "w": 24, - "x": 0, - "y": 6 - }, - "id": 23, - "options": { - "mode": "markdown", - "content": "# WA Systems" - }, - "transparent": true, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "datasource": null, - "pluginVersion": "7.5.7" - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 0, - "y": 8 - }, - "id": 24, - "links": [], - "mode": "markdown", - "title": "Infrastructure", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 0, - "y": 9 - }, - "id": 25, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"infrastructure\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 2, - "y": 9 - }, - "id": 26, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"infrastructure\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 4, - "y": 9 - }, - "id": 27, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"infrastructure\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"infrastructure\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 6, - "y": 9 - }, - "id": 28, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"infrastructure\"} + node_memory_Buffers_bytes{job=\"infrastructure\"} + node_memory_Cached_bytes{job=\"infrastructure\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"infrastructure\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 8, - "y": 8 - }, - "id": 29, - "links": [], - "mode": "markdown", - "title": "Services", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 8, - "y": 9 - }, - "id": 30, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"services\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 10, - "y": 9 - }, - "id": 31, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"services\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 12, - "y": 9 - }, - "id": 32, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"services\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"services\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 14, - "y": 9 - }, - "id": 33, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"services\"} + node_memory_Buffers_bytes{job=\"services\"} + node_memory_Cached_bytes{job=\"services\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"services\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 16, - "y": 8 - }, - "id": 34, - "links": [], - "mode": "markdown", - "title": "Solr", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 16, - "y": 9 - }, - "id": 35, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"solr\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 9 - }, - "id": 36, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"solr\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 20, - "y": 9 - }, - "id": 37, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"solr\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"solr\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 9 - }, - "id": 38, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"solr\"} + node_memory_Buffers_bytes{job=\"solr\"} + node_memory_Cached_bytes{job=\"solr\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"solr\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 16, - "y": 11 - }, - "id": 39, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(probe_http_status_code{job=\"solr-query\"} != 200) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Query", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "type": "text", - "gridPos": { - "h": 2, - "w": 24, - "x": 0, - "y": 12 - }, - "id": 40, - "options": { - "mode": "markdown", - "content": "# WA Services" - }, - "transparent": true, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "datasource": null, - "pluginVersion": "7.5.7" - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 0, - "y": 14 - }, - "id": 41, - "links": [], - "mode": "markdown", - "title": "Ingest & Metadata", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 0, - "y": 15 - }, - "id": 42, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"ingest_metadata\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ba43a9" - }, - { - "value": 1.1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 2, - "y": 15 - }, - "id": 43, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"ingest_metadata\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 4, - "y": 15 - }, - "id": 44, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"ingest_metadata\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"ingest_metadata\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 6, - "y": 15 - }, - "id": 45, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"ingest_metadata\"} + node_memory_Buffers_bytes{job=\"ingest_metadata\"} + node_memory_Cached_bytes{job=\"ingest_metadata\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"ingest_metadata\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 0, - "y": 17 - }, - "id": 46, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(probe_http_status_code{job=\"im-access-http\"} != 200) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "WWW", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 8, - "y": 14 - }, - "id": 47, - "links": [], - "mode": "markdown", - "title": "TrackDB", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 22, - "color": "#ED8027" - }, - { - "value": 26, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 8, - "y": 15 - }, - "id": 48, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "value", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "(time() - trackdb_refresh_timestamp) / (60*60)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "trackdb refresh", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#D44A3A" - }, - { - "value": 10, - "color": "#ED8027" - }, - { - "value": 100, - "color": "#299C46" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 12, - "y": 15 - }, - "id": 49, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "value", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(trackdb_numFound - (trackdb_numFound offset 1d))", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "trackdb numFound", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "content": "", - "gridPos": { - "h": 1, - "w": 8, - "x": 16, - "y": 14 - }, - "id": 50, - "links": [], - "mode": "markdown", - "title": "Discovery & Access", - "type": "text" - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 16, - "y": 15 - }, - "id": 51, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "sum(1 - up{job=\"discovery_access\"})", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Up", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 15 - }, - "id": 52, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(irate(node_cpu_seconds_total{job=\"discovery_access\",mode=\"idle\"}[5m]) < 0.1) by (instance)) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "CPU", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 20, - "y": 15 - }, - "id": 53, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count((node_filesystem_avail_bytes{job=\"discovery_access\",fstype!~\"tmpfs|rootfs|cifs\"} / node_filesystem_size_bytes{job=\"discovery_access\",fstype!~\"tmpfs|rootfs|cifs\"}) < 0.04) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Dsk", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ba43a9" - }, - { - "value": 1.1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 15 - }, - "id": 54, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(sum(node_memory_MemFree_bytes{job=\"discovery_access\"} + node_memory_Buffers_bytes{job=\"discovery_access\"} + node_memory_Cached_bytes{job=\"discovery_access\"}) by (instance) / sum(node_memory_MemTotal_bytes{job=\"discovery_access\"}) by (instance) < 0.05) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "Mem", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 2.1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 16, - "y": 17 - }, - "id": 55, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(uptimerobot_monitor_up==0) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "UTR", - "cacheTimeout": null, - "interval": null, - "datasource": null - }, - { - "fieldConfig": { - "defaults": { - "custom": {}, - "unit": "none", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "value": null, - "color": "#299C46" - }, - { - "value": 0.1, - "color": "#ED8027" - }, - { - "value": 1.1, - "color": "#D44A3A" - } - ] - }, - "mappings": [ - { - "op": "=", - "text": "N/A", - "value": "null", - "id": 0, - "type": 1 - } - ], - "nullValueMode": "connected" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 17 - }, - "id": 56, - "links": [], - "maxDataPoints": 100, - "options": { - "reduceOptions": { - "values": false, - "calcs": [ - "lastNotNull" - ], - "fields": "" - }, - "orientation": "horizontal", - "textMode": "none", - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto" - }, - "pluginVersion": "7.1.3", - "targets": [ - { - "expr": "count(probe_http_status_code{job=\"da-access-http\"} != 200) OR vector(0)", - "legendFormat": "", - "interval": "", - "format": "time_series", - "instant": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "type": "stat", - "title": "WWW", - "cacheTimeout": null, - "interval": null, - "datasource": null - } - ], - "refresh": "1m", - "schemaVersion": 16, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "WA Status", - "uid": "wast", - "version": 3 -} diff --git a/monitor/grafana/provisioning/dashboards/wa_status.json-template b/monitor/grafana/provisioning/dashboards/wa_status.json-template index 114ffc1..32d7bd7 100644 --- a/monitor/grafana/provisioning/dashboards/wa_status.json-template +++ b/monitor/grafana/provisioning/dashboards/wa_status.json-template @@ -1795,11 +1795,11 @@ "color": "#D44A3A" }, { - "value": 10, + "value": 8.5, "color": "#ED8027" }, { - "value": 11, + "value": 9.5, "color": "#299C46" } ] From b5153ffebd09b24e27c7793182d97d99cd167bfa Mon Sep 17 00:00:00 2001 From: Gil Hoggarth Date: Thu, 17 Nov 2022 12:59:25 +0000 Subject: [PATCH 6/6] Shifted schedule to settings; Improved debug logging; Added prod settings --- stat-pusher/ldl-pusher.py | 39 ++++++++++++++++++++++++----------- stat-pusher/run_ldl_pusher.sh | 6 +++++- stat-pusher/settings | 18 +++++++++++++++- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/stat-pusher/ldl-pusher.py b/stat-pusher/ldl-pusher.py index 0fbdcef..ea7022d 100755 --- a/stat-pusher/ldl-pusher.py +++ b/stat-pusher/ldl-pusher.py @@ -20,9 +20,9 @@ LOCKFILE = f"{PIDFILE}.lock" SETTINGSFILE = 'settings' REQUEST = re.compile("^\w+\s+(/.+)\s+HTTP/\d.\d$") -LDLHOST = re.compile("^/wa/monitor\?host=(.+)$") +HOSTREQ = re.compile("^/wa/monitor\?host=(.+)$") +LDLHOST = re.compile("^DLS-(BSP|LON|NLS|NLW)-WB0[1-4]$") YMDHM = '%Y%m%d%H%M' -SCHEDULE = 15 INSTANCE = 'ldl_connection_count' # environ settings @@ -48,6 +48,7 @@ def _read_settings(environ): sys.exit(1) class webServer(BaseHTTPRequestHandler): + global REQUEST def _set_headers(self): self.send_response(200) self.send_header("Content-type", "text/html") @@ -70,8 +71,9 @@ def do_GET(self): _process_request(request) def _process_request(request): + global HOSTREQ + global LDLHOST global YMDHM - global SCHEDULE global INSTANCE global eset global dldl @@ -79,13 +81,21 @@ def _process_request(request): logger.debug(f"Received request: {request}") # get hostname, skip further processing if fail - ldlHostMatch = LDLHOST.match(request) - if ldlHostMatch: - ldlHost = ldlHostMatch.group(1) + hostReqMatch = HOSTREQ.match(request) + if hostReqMatch: + hostReq = hostReqMatch.group(1) else: logger.warning(f"Failed to get hostname from [{request}]") return + # check hostname is LDL VM + ldlHostMatch = LDLHOST.match(hostReq) + if ldlHostMatch: + ldlHost = hostReq + else: + logger.warning(f"Skipping non LDL DLS VM hostname [{hostReq}]") + return + # get current time nowymdhm = int((datetime.datetime.now()).strftime(YMDHM)) @@ -93,26 +103,29 @@ def _process_request(request): dldl[ldlHost] = nowymdhm # on schedule, report LDL connection status to pushgateway + schedule = int(eset['schedule']) logger.debug(f"dldl {dldl}") - if (nowymdhm - pushymdhm) > SCHEDULE: - # count LDLs responded in last SCHEDULE period + logger.debug(f"Schedule: [{nowymdhm} - {pushymdhm}] = [{nowymdhm - pushymdhm}], schedule [{schedule}]") + if (nowymdhm - pushymdhm) > schedule: + # count LDLs responded in last schedule period up = 0 for _ldl in dldl: - if (nowymdhm - dldl[_ldl]) < SCHEDULE: up += 1 - else: logger.debug(f"LDL [{_ldl}] hasn't curled in {SCHEDULE} minutes") + if (nowymdhm - dldl[_ldl]) < schedule: up += 1 + else: logger.debug(f"LDL [{_ldl}] hasn't curled in {schedule} minutes") # set pushgateway values and push to prometheus service registry = CollectorRegistry() g = Gauge(eset['metric'], eset['desc'], labelnames=['instance'], registry=registry) g.labels(instance=INSTANCE).set(up) push_to_gateway(eset['pushgtw'], registry=registry, job=eset['job']) + logger.debug(f"Pushed to gateway:\tjob={eset['job']}, instance={INSTANCE}, recent_connections={up}\n") # write latest push to output file (done via output rather than log so log doesn't # become huge over time) with open(eset['output'], 'w') as out: out.write(f"Output datestamp:\t{nowymdhm}\n") - out.write(f"Pushing to gateway:\tjob={eset['job']}, instance={INSTANCE}, recent_connections={up}\n") - for _ldl in dldl: out.write(f"\t{_ldl}:\t{dldl[_ldl]}\n") + out.write(f"Pushed to gateway:\tjob={eset['job']}, instance={INSTANCE}, recent_connections={up}\n") + for _ldl in dldl: out.write(f"\t{_ldl}:\t{dldl[_ldl]}\tRecent [{(nowymdhm - dldl[_ldl]) < schedule}]\n") out.write("\n") out.close() @@ -121,11 +134,13 @@ def _process_request(request): # script -------------------------------------- def script(eset): + global pushymdhm log.configure_file(eset) # create web service monitorServer = HTTPServer((eset['hostname'], int(eset['port'])), webServer) logger.info(f"Started LDL monitoring web server: {eset['hostname']}:{eset['port']}") + logger.debug(f"Pushing to gateway every [{eset['schedule']}] minutes") try: monitorServer.serve_forever() except Exception as e: diff --git a/stat-pusher/run_ldl_pusher.sh b/stat-pusher/run_ldl_pusher.sh index 556a70f..84c0ef6 100755 --- a/stat-pusher/run_ldl_pusher.sh +++ b/stat-pusher/run_ldl_pusher.sh @@ -14,4 +14,8 @@ cd $PYTHONPATH pip install -r ldl-requirements.txt # run stat-pusher script -nohup python ldl-pusher.py & +if [[ ${HOSTNAME} =~ ^prod ]]; then + nohup python ldl-pusher.py > /dev/null & # disable generation of large logs over time +else + nohup python ldl-pusher.py & +fi diff --git a/stat-pusher/settings b/stat-pusher/settings index 56fd5ce..883a484 100644 --- a/stat-pusher/settings +++ b/stat-pusher/settings @@ -5,12 +5,13 @@ statsfile = dev.stats # ldl server connection testing logfpfn = /home/gilh/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.log output = /home/gilh/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.out -loglevel = INFO +loglevel = DEBUG uid = 1004 gid = 1004 hostname = 0.0.0.0 port = 9119 # prometheus settings +schedule = 5 job = ldl_rr metric = recent_connections desc = Curl requests from LDL VMs, indicating LDL to WA connectivity @@ -24,3 +25,18 @@ statsfile = beta.stats [prod] pushgtw = http://monitor-pushgateway.api.wa.bl.uk statsfile = prod.stats + +# ldl server connection testing +logfpfn = /home/monitor/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.log +output = /home/monitor/github/ukwa-monitor/stat-pusher/logs/ldl-pusher.out +loglevel = INFO +uid = 1000 +gid = 1000 +hostname = 0.0.0.0 +port = 9119 +# prometheus settings +schedule = 20 +job = ldl_rr +metric = recent_connections +desc = Curl requests from LDL VMs, indicating LDL to WA connectivity +