added managed prometheus alert manager config for memcached and haproxy (#564)

Co-authored-by: Bryant Biggs <bryantbiggs@gmail.com>
aws-ia · May 18, 2022 · afe7184 · afe7184
1 parent 14275d9
commit afe7184
Show file tree

Hide file tree

Showing 2 changed files with 106 additions and 0 deletions.
diff --git a/examples/observability/adot-amp-grafana-for-haproxy/main.tf b/examples/observability/adot-amp-grafana-for-haproxy/main.tf
@@ -144,3 +144,78 @@ resource "grafana_dashboard" "haproxy_dashboards" {
   folder      = grafana_folder.haproxy_dashboards.id
   config_json = file("${path.module}/dashboards/default.json")
 }
+
+# Amazon Managed Prometheus rule groups for HAProxy: a down alert plus 4xx/5xx error-rate (> 5%) alerts per backend and per server
+resource "aws_prometheus_rule_group_namespace" "haproxy" {
+  name         = "haproxy_rules"
+  workspace_id = module.eks_blueprints.amazon_prometheus_workspace_id
+  data         = <<EOF
+  groups:
+  - name: obsa-haproxy-down-alert
+    rules:
+    - alert: HA_proxy_down
+      expr: haproxy_up == 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: HAProxy down (instance {{ $labels.instance }})
+        description: "HAProxy down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+  - name: obsa-haproxy-http4xx-error-alert
+    rules:
+    - alert: Ha_proxy_High_Http4xx_ErrorRate_Backend
+      expr: (sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
+        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+  - name: obsa-haproxy-http5xx-error-alert
+    rules:
+    - alert: Ha_proxy_High_Http5xx_ErrorRate_Backend
+      expr: (sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
+        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+  - name: obsa-haproxy-Http4xx-ErrorRate-Server-alert
+    rules:
+    - alert: Ha_proxy_High_Http4xx_ErrorRate_Server
+      expr: (sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
+        description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+  - name: obsa-haproxy-Http5xx-ErrorRate-Server-alert
+    rules:
+    - alert: Ha_proxy_High_Http5xx_ErrorRate_Server
+      expr: (sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
+        description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+  EOF
+}
+
+# Amazon Managed Prometheus Alertmanager definition: routes every alert to the single receiver 'default', which declares no notification targets here
+resource "aws_prometheus_alert_manager_definition" "haproxy" {
+  workspace_id = module.eks_blueprints.amazon_prometheus_workspace_id
+  definition   = <<EOF
+  alertmanager_config: |
+    route:
+      receiver: 'default'
+    receivers:
+      - name: 'default'
+  EOF
+}
diff --git a/examples/observability/adot-amp-grafana-for-memcached/main.tf b/examples/observability/adot-amp-grafana-for-memcached/main.tf
@@ -144,3 +144,34 @@ resource "grafana_dashboard" "memchached_dashboards" {
   folder      = grafana_folder.memchached_dashboards.id
   config_json = file("${path.module}/dashboards/default.json")
 }
+
+# Amazon Managed Prometheus rule group for memcached: fires a critical alert as soon as memcached_up reports 0
+resource "aws_prometheus_rule_group_namespace" "memcached" {
+  name         = "memcached_rules"
+  workspace_id = module.eks_blueprints.amazon_prometheus_workspace_id
+  data         = <<EOF
+  groups:
+  - name: obsa-memcached-down-alert
+    rules:
+    - alert: memcached-down
+      expr: memcached_up == 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: memcached down (instance {{ $labels.instance }})
+        description: "memcached instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+  EOF
+}
+
+# Amazon Managed Prometheus Alertmanager definition: routes every alert to the single receiver 'default', which declares no notification targets here
+resource "aws_prometheus_alert_manager_definition" "memcached" {
+  workspace_id = module.eks_blueprints.amazon_prometheus_workspace_id
+  definition   = <<EOF
+  alertmanager_config: |
+    route:
+      receiver: 'default'
+    receivers:
+      - name: 'default'
+  EOF
+}