From a5ad0a3d487441cf1035ab83f98f3897a619cc5b Mon Sep 17 00:00:00 2001
From: Andrew Jackson
Date: Mon, 3 Jul 2023 11:30:58 +0100
Subject: [PATCH] Tidy alerts and add SSL alert, for #54.

---
Notes (ignored by git-am):
  * Hunk 1 adds the tidy-logs_no_new_crawl_logs alert at the end of the
    rule group that precedes "Generic metrics".
  * Hunk 2 reuses the old tidy-logs alert slot in the later group for a
    new ssl_certs_nearing_expiration alert (fires when fewer than 30 days
    remain until probe_ssl_earliest_cert_expiry).
  * NOTE(review): this patch was recovered from a copy whose line breaks
    and indentation had been collapsed. The YAML indentation and the
    position of blank context lines below are reconstructed from the
    hunk line counts - confirm against the target file before applying.

 monitor/prometheus/alert.rules.yml | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/monitor/prometheus/alert.rules.yml b/monitor/prometheus/alert.rules.yml
index 63e7e64..c1546f6 100644
--- a/monitor/prometheus/alert.rules.yml
+++ b/monitor/prometheus/alert.rules.yml
@@ -121,6 +121,15 @@ groups:
           summary: "The FC output volume is filling up with WARCs"
           description: "The number of WARCs on the FC Gluster volume appears to be increasing: check move-to-hdfs is working as expected."
 
+      - alert: tidy-logs_no_new_crawl_logs
+        expr: delta(ukwa_crawler_log_size_bytes{log='crawl.log'}[1h]) == 0 or absent(ukwa_crawler_log_size_bytes{log='crawl.log'})
+        for: 1h
+        labels:
+          severity: severe
+        annotations:
+          summary: "No new crawl logs from tidy-logs"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} failed to run."
+
   - name: Generic metrics
     rules:
 
@@ -180,12 +189,13 @@ groups:
           summary: "CPU running too hot?"
           description: "The CPU on {{ $labels.instance }} is running hot (>70C for 30mins)."
 
-      - alert: tidy-logs_no_new_crawl_logs
-        expr: delta(ukwa_crawler_log_size_bytes{log='crawl.log'}[1h]) == 0 or absent(ukwa_crawler_log_size_bytes{log='crawl.log'})
+      - alert: ssl_certs_nearing_expiration
+        expr: (probe_ssl_earliest_cert_expiry - time())/(60*60*24) < 30
         for: 1h
         labels:
           severity: severe
         annotations:
-          summary: "No new crawl logs from tidy-logs"
-          description: "{{ $labels.instance }} of job {{ $labels.job }} failed to run."
+          summary: "SSL certificate for {{ $labels.instance }} will expire soon!"
+          description: "The SSL certificate for {{ $labels.instance }} (part of {{ $labels.job }}) will expire in less than 30 days."
+
 