diff --git a/examples/all_in_one/README.md b/examples/all_in_one/README.md index 6420a95..56509ed 100644 --- a/examples/all_in_one/README.md +++ b/examples/all_in_one/README.md @@ -1,4 +1,31 @@ -# TODO +# All-in-one example +### Overview +Use the provided [docker-compose](./docker-compose.yaml) to start the complete setup: a Prometheus instance loaded with [example SLO recording rules](../../prometheus_rules) and a Grafana instance preloaded with [SLO dashboards](../../grafana_dashboards). -Example with docker compose which runs some dummy app, slo-exporter, -Prometheus and Grafana to see the whole setup. +Description of the whole setup follows: +- **Nginx configured with the following paths:** + - `nginx:8080/` -> `HTTP 200`, all ok + - `nginx:8080/err` -> `HTTP 500`, availability violation + - `nginx:8080/drop` -> `limit 1r/m`, latency violation +- **Slo-exporter configured to tail the nginx logs** +- **Prometheus** + - configured to scrape the slo-exporter's metrics + - loaded with necessary recording-rules for SLO computation +- **Grafana** + - with Prometheus preconfigured as a datasource + - loaded with [SLO dashboards](../../grafana_dashboards/) +- **Slo-event-generator** + - infinite loop accessing the nginx instance to generate slo-events. 
+ +### How to run it +``` +docker-compose up +``` + +To access Grafana and Prometheus: +``` +# http://localhost:9090 Prometheus +# http://localhost:3000 Grafana +# User: admin +# Password: admin +``` \ No newline at end of file diff --git a/examples/all_in_one/docker-compose.yaml b/examples/all_in_one/docker-compose.yaml new file mode 100644 index 0000000..9b5d2ca --- /dev/null +++ b/examples/all_in_one/docker-compose.yaml @@ -0,0 +1,70 @@ +version: '3' + +services: + nginx: + image: nginx + volumes: + - "./nginx/conf/nginx.conf:/etc/nginx/nginx.conf:ro" + - "./nginx/static:/nginx/static:ro" + - "nginx-logs:/nginx/logs/" + + slo-exporter: + image: seznam/slo-exporter:6.1.0 + depends_on: + - nginx + ports: + - 8001:8001 + working_dir: /slo-exporter + command: + - "--config-file=/slo-exporter/conf/slo_exporter.yaml" + volumes: + - ./slo-exporter/conf:/slo-exporter/conf/ + - nginx-logs:/logs/ + + slo-event-generator: + image: nginx + entrypoint: /bin/bash + command: -c 'while true; do + for i in `seq 20`; do curl -s http://nginx:8080/ >/dev/null 2>&1 ; done; + for i in `seq $$(($$RANDOM % 3))`; do curl -s http://nginx:8080/err >/dev/null 2>&1 ; done; + curl -m 1 -s http://nginx:8080/drop >/dev/null 2>&1 >/dev/null || true; + echo -n "."; + sleep 5; + done' + + prometheus: + image: prom/prometheus:latest + depends_on: + - slo-exporter + ports: + - 9090:9090 + environment: + PROMETHEUS_CONFIG: | + { + "scrape_configs":[{ + "job_name": "slo-exporter", + "scrape_interval": "2s", + "static_configs":[ + {"targets":["slo-exporter:8001"]}, + ], + }], + "rule_files": ["/prometheus/recording_rules/*yaml", "/prometheus/recording_rules/slo/*yaml"] + } + entrypoint: ["sh"] + command: + - "-c" + - 'echo $$PROMETHEUS_CONFIG > /etc/prometheus/prometheus.yml; prometheus --config.file=/etc/prometheus/prometheus.yml' + volumes: + - ./prometheus/recording_rules:/prometheus/recording_rules + + grafana: + image: grafana/grafana + depends_on: + - prometheus + ports: + - 3000:3000 + volumes: 
+ - ./grafana/provisioning/:/etc/grafana/provisioning/ + +volumes: + nginx-logs: diff --git a/examples/all_in_one/grafana/provisioning/dashboards/SLO_detailed.json b/examples/all_in_one/grafana/provisioning/dashboards/SLO_detailed.json new file mode 100644 index 0000000..9da58b1 --- /dev/null +++ b/examples/all_in_one/grafana/provisioning/dashboards/SLO_detailed.json @@ -0,0 +1,1321 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.0" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1593202166365, + "links": [ + { + "icon": "external link", + "includeVars": false, + "tags": [ + "SRE", + "SLO", + "endpoints error-rate" + ], + "targetBlank": true, + "type": "dashboards" + }, + { + "icon": "external link", + "tags": [ + "SRE", + "SLO", + "endpoints-distribution" + ], + "targetBlank": true, + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 256, + "panels": [], + "repeat": "slo_type", + "title": "$slo_type", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + 
"nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.995 + }, + { + "color": "#299c46", + "value": 0.999 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 52, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "fieldOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "values": false + } + }, + "pluginVersion": "7.0.0", + "repeat": "slo_class", + "repeatDirection": "h", + "targets": [ + { + "expr": "1 - slo:violation_ratio{slo_domain=\"$slo_domain\", slo_type=\"$slo_type\", slo_class=\"$slo_class\", slo_version=\"$slo_version\", slo_time_range=\"$slo_time_range\", namespace=\"$namespace\"}", + "hide": false, + "instant": true, + "refId": "A" + } + ], + "title": "$slo_type $slo_class", + "type": "stat" + }, + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 146, + "panels": [], + "title": "Error budget", + "type": "row" + }, + { + "aliasColors": { + "Error budget empty": "#890f02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 144, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "paceLength": 10, + "percentage": false, + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "repeat": "slo_class", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "placeholder", + "lines": false + }, + { + "alias": "zero", + "lines": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "0", + "hide": false, + "legendFormat": "zero", + "refId": "B" + }, + { + "expr": "slo:violation_ratio{slo_domain=\"$slo_domain\", slo_type=~\"$slo_type\", slo_class=\"$slo_class\", slo_version=\"$slo_version\", slo_time_range=\"$slo_time_range\", namespace=\"$namespace\"}\n/ on (slo_class,slo_domain,slo_version, slo_type, namespace) group_left ()\n(\n slo:violation_ratio_threshold - 1\n)\n+1", + "hide": false, + "interval": "", + "legendFormat": "{{ slo_type }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "error budget for $slo_class slo class", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 163, + "panels": [ + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + 
}, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 14 + }, + "id": 180, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "fieldOptions": { + "calcs": [ + "mean" + ] + }, + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "values": false + } + }, + "pluginVersion": "7.0.0", + "repeat": "slo_class", + "repeatDirection": "h", + "scopedVars": { + "slo_class": { + "selected": false, + "text": "critical", + "value": "critical" + }, + "slo_type": { + "selected": false, + "text": "availability", + "value": "availability" + } + }, + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$slo_type\", namespace=\"$namespace\"}", + "format": "time_series", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "$slo_type $slo_class", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 14 + }, + "id": 339, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "fieldOptions": { + "calcs": [ + "mean" + ] + }, + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "values": false + } + }, + "pluginVersion": "7.0.0", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 
1591090840059, + "repeatPanelId": 180, + "scopedVars": { + "slo_class": { + "selected": false, + "text": "high_fast", + "value": "high_fast" + }, + "slo_type": { + "selected": false, + "text": "availability", + "value": "availability" + } + }, + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$slo_type\", namespace=\"$namespace\"}", + "format": "time_series", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "$slo_type $slo_class", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 14 + }, + "id": 340, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "fieldOptions": { + "calcs": [ + "mean" + ] + }, + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "values": false + } + }, + "pluginVersion": "7.0.0", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1591090840059, + "repeatPanelId": 180, + "scopedVars": { + "slo_class": { + "selected": false, + "text": "high_slow", + "value": "high_slow" + }, + "slo_type": { + "selected": false, + "text": "availability", + "value": "availability" + } + }, + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$slo_type\", namespace=\"$namespace\"}", + "format": "time_series", + 
"hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "$slo_type $slo_class", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 14 + }, + "id": 341, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "fieldOptions": { + "calcs": [ + "mean" + ] + }, + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "values": false + } + }, + "pluginVersion": "7.0.0", + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1591090840059, + "repeatPanelId": 180, + "scopedVars": { + "slo_class": { + "selected": false, + "text": "low", + "value": "low" + }, + "slo_type": { + "selected": false, + "text": "availability", + "value": "availability" + } + }, + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$slo_type\", namespace=\"$namespace\"}", + "format": "time_series", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "$slo_type $slo_class", + "type": "stat" + } + ], + "repeat": "slo_type", + "title": "$slo_type thresholds (as %)", + "type": "row" + }, + { + "collapsed": true, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 301, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + 
"rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 29 + }, + "id": 199, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.6.1", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "repeat": "slo_class", + "repeatDirection": "h", + "scopedVars": { + "latency_slo_type": { + "selected": false, + "text": "latency90", + "value": "latency90" + }, + "slo_class": { + "selected": false, + "text": "critical", + "value": "critical" + } + }, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "le", + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$latency_slo_type\", namespace=\"$namespace\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "$latency_slo_type percentile threshold for $slo_class", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", 
+ "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 29 + }, + "id": 355, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.6.1", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1591090840059, + "repeatPanelId": 199, + "scopedVars": { + "latency_slo_type": { + "selected": false, + "text": "latency90", + "value": "latency90" + }, + "slo_class": { + "selected": false, + "text": "high_fast", + "value": "high_fast" + } + }, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "le", + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$latency_slo_type\", namespace=\"$namespace\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "$latency_slo_type percentile threshold for $slo_class", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "fieldConfig": { 
+ "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 29 + }, + "id": 356, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.6.1", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1591090840059, + "repeatPanelId": 199, + "scopedVars": { + "latency_slo_type": { + "selected": false, + "text": "latency90", + "value": "latency90" + }, + "slo_class": { + "selected": false, + "text": "high_slow", + "value": "high_slow" + } + }, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "le", + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$latency_slo_type\", namespace=\"$namespace\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "$latency_slo_type percentile threshold for $slo_class", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + 
"custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 29 + }, + "id": 357, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.6.1", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "repeat": null, + "repeatDirection": "h", + "repeatIteration": 1591090840059, + "repeatPanelId": 199, + "scopedVars": { + "latency_slo_type": { + "selected": false, + "text": "latency90", + "value": "latency90" + }, + "slo_class": { + "selected": false, + "text": "low", + "value": "low" + } + }, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "le", + "targets": [ + { + "expr": "slo:violation_ratio_threshold{slo_class=\"$slo_class\",slo_domain=\"$slo_domain\",slo_version=\"$slo_version\", slo_type=\"$latency_slo_type\", namespace=\"$namespace\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "$latency_slo_type percentile threshold for $slo_class", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "current" + } + ], + "repeat": "latency_slo_type", + "title": "$latency_slo_type thresholds (as duration)", + "type": "row" + } + ], + "refresh": "5s", + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + 
"datasource": "Prometheus", + "definition": "query_result(slo:stable_version{enabled!=\"false\"})", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_version", + "options": [], + "query": "query_result(slo:stable_version{enabled!=\"false\"})", + "refresh": 2, + "regex": "/slo_version=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\"}, slo_domain)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_domain", + "options": [], + "query": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\"}, slo_domain)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "query_result(slo:violation_ratio_threshold{slo_version=~\"$slo_version\", slo_domain=~\"$slo_domain\"})", + "hide": 0, + "includeAll": true, + "label": "", + "multi": true, + "name": "slo_class", + "options": [], + "query": "query_result(slo:violation_ratio_threshold{slo_version=~\"$slo_version\", slo_domain=~\"$slo_domain\"})", + "refresh": 2, + "regex": "/slo_class=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "5m", + "value": "5m" + }, + "datasource": "Prometheus", + "definition": "label_values(slo_time_range)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_time_range", + "options": [], + "query": "label_values(slo_time_range)", + "refresh": 2, + 
"regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "Prometheus", + "definition": "query_result(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\"})", + "hide": 0, + "includeAll": true, + "label": "", + "multi": true, + "name": "slo_type", + "options": [], + "query": "query_result(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\"})", + "refresh": 2, + "regex": "/slo_type=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "Prometheus", + "definition": "query_result(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\", slo_type=~\"$slo_type\"})", + "hide": 2, + "includeAll": true, + "label": "", + "multi": true, + "name": "latency_slo_type", + "options": [], + "query": "query_result(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\", slo_type=~\"$slo_type\"})", + "refresh": 2, + "regex": "/slo_type=\"(latency[^\"]*)\"/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "query_result(slo:violation_ratio_threshold{slo_version=~\"$slo_version\", slo_domain=~\"$slo_domain\"})", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": "query_result(slo:violation_ratio_threshold{slo_version=~\"$slo_version\", slo_domain=~\"$slo_domain\"})", + "refresh": 2, + "regex": "/namespace=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + 
"type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-10m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "SLO detailed", + "uid": "lRKeWGZGk", + "version": 3 +} diff --git a/examples/all_in_one/grafana/provisioning/dashboards/SLO_domains_overview.json b/examples/all_in_one/grafana/provisioning/dashboards/SLO_domains_overview.json new file mode 100644 index 0000000..1a761c1 --- /dev/null +++ b/examples/all_in_one/grafana/provisioning/dashboards/SLO_domains_overview.json @@ -0,0 +1,235 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "refresh": "5s", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1593416802079, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "datasource": "Prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "nullValueMode": "connected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#bf1b00", + "value": null + }, + { + "color": "#e5ac0e", + "value": 0 + }, + { + "color": "#299c46", + "value": 0.2 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 
+ }, + "id": 13, + "interval": "", + "links": [ + { + "title": " ", + "url": " " + } + ], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "fieldOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "graphMode": "area", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "values": false + } + }, + "pluginVersion": "7.0.0", + "repeat": "slo_domain", + "repeatDirection": "h", + "targets": [ + { + "expr": "min(\n slo:violation_ratio{slo_domain=\"$slo_domain\", slo_time_range=\"$slo_time_range\", namespace=\"$namespace\"}\n * on (slo_domain,slo_version, namespace) group_left() slo:stable_version{slo_domain=\"$slo_domain\"}\n / on (slo_class,slo_domain,slo_version, slo_type, namespace) group_left ()\n (\n slo:violation_ratio_threshold - 1\n)\n +1\n)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "title": "$slo_domain error budget", + "type": "stat" + } + ], + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:stable_version{},namespace)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(slo:stable_version{},namespace)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:stable_version{namespace=~\"$namespace\"},slo_domain)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "slo_domain", + "options": [], + "query": "label_values(slo:stable_version{namespace=~\"$namespace\"},slo_domain)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + 
"tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:violation_ratio{namespace=~\"$namespace\", slo_domain=~\"$slo_domain\"},slo_time_range)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_time_range", + "options": [], + "query": "label_values(slo:violation_ratio{namespace=~\"$namespace\", slo_domain=~\"$slo_domain\"},slo_time_range)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "SLO domains overview", + "version": 92 +} diff --git a/examples/all_in_one/grafana/provisioning/dashboards/SLO_drilldown.json b/examples/all_in_one/grafana/provisioning/dashboards/SLO_drilldown.json new file mode 100644 index 0000000..35de777 --- /dev/null +++ b/examples/all_in_one/grafana/provisioning/dashboards/SLO_drilldown.json @@ -0,0 +1,657 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table-old", + "name": "Table (old)", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + 
"gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1593713520364, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 8, + "panels": [], + "title": "Error Budget", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/traffic locality/", + "color": "rgba(44, 42, 42, 0.24)", + "fill": 6, + "yaxis": 2, + "zindex": 0 + }, + { + "alias": "traffic locality nagano", + "transform": "negative-Y" + }, + { + "alias": "/disabled locality/", + "color": "rgb(124, 124, 124)", + "fill": 7, + "zindex": -3 + }, + { + "alias": "/error budget/", + "zindex": 3 + }, + { + "alias": "/error budget.*offset/", + "dashes": true, + "spaceLength": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slo:violation_ratio{slo_time_range=\"$slo_time_range\", slo_version=\"$slo_version\", slo_domain=~\"$slo_domain\", slo_type=~\"$slo_type\", slo_class=~\"$slo_class\", namespace=\"$namespace\"} \n/ on (slo_domain, slo_class, slo_version, slo_type) group_left ()\n(slo:violation_ratio_threshold - 1)\n+1", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "error budget {{slo_domain}} {{slo_type}} {{slo_class}} {{slo_type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "$slo_type error budget ($slo_time_range)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": "50", + "min": "-50", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 16, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/traffic locality/", + "color": "rgba(44, 42, 42, 0.24)", + "fill": 6, + "yaxis": 2, + "zindex": 0 + }, + { + "alias": "traffic locality nagano", + "transform": "negative-Y" + }, + { + "alias": "/disabled locality/", + "color": "rgb(124, 124, 124)", + "fill": 7, + "zindex": -3 + }, + { + "alias": "/error budget/", + "zindex": 3 + }, + { + "alias": 
"/error budget.*offset/", + "dashes": true, + "spaceLength": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slo:violation_ratio{slo_time_range=\"$slo_time_range\", slo_version=\"$slo_version\", slo_domain=~\"$slo_domain\", slo_type=~\"$slo_type\", slo_class=~\"$slo_class\", namespace=\"$namespace\"} offset $slo_time_range\n/ on (slo_domain, slo_class, slo_version, slo_type) group_left ()\n(slo:violation_ratio_threshold - 1)\n+1", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "error budget {{slo_domain}} {{slo_type}} {{slo_class}} {{slo_type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "$slo_type error budget ($slo_time_range, offset $slo_time_range)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": "50", + "min": "-50", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Error budget (w offset)", + "type": "row" + }, + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 10, + "panels": [], + "title": "Error budget burned on selected time window (by app)", + "type": "row" + }, + { + "columns": [ + { + "text": "Current", + "value": "current" + } + ], + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "100%", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 4, + "pageSize": null, + "showHeader": true, + "sort": { + "col": 0, + "desc": 
true + }, + "styles": [ + { + "alias": "", + "align": "left", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 3, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "percentunit" + } + ], + "targets": [ + { + "expr": "(\n (\n ( # violation ratio at the beginning of the dashboard chosen time range\n clamp_min(\n sum(\n increase(slo_domain_slo_class_slo_app:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\", result=\"fail\"}[$slo_time_range] offset ${__range_s}s)\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace, slo_app)\n / on(slo_domain, slo_version, slo_class) group_left()\n sum(\n increase(slo_domain_slo_class:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\"}[$slo_time_range] offset ${__range_s}s)\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace)\n , 0)\n )\n / on (slo_domain, slo_class, slo_version, slo_type) group_left ()\n (slo:violation_ratio_threshold - 1)\n # this gets us % (e.g. 
-0.05 for 5 percent) of error budget burnt by each individual app at the beginning of the time range\n ) * -1\n -\n (\n ( # violation ratio valid at the end of the dashboard chosen time range\n clamp_min(\n sum(\n increase(slo_domain_slo_class_slo_app:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\", result=\"fail\"}[$slo_time_range])\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace, slo_app)\n / on(slo_domain, slo_version, slo_class) group_left()\n sum(\n increase(slo_domain_slo_class:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\"}[$slo_time_range])\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace)\n , 0)\n )\n / on (slo_domain, slo_class, slo_version, slo_type) group_left ()\n (slo:violation_ratio_threshold - 1)\n # this gets us % (e.g. -0.05 for 5 percent) of error budget burnt by each individual app at the end of the displayed time range\n ) * -1\n)\n", + "instant": true, + "interval": "", + "legendFormat": "{{ slo_app }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "", + "transform": "timeseries_aggregations", + "type": "table-old" + }, + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 12, + "panels": [], + "title": "Error budget burned on selected time window (by app:event_key)", + "type": "row" + }, + { + "columns": [ + { + "text": "Current", + "value": "current" + } + ], + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "100%", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 6, + "pageSize": null, + "showHeader": true, + "sort": { + "col": 1, + "desc": false + }, + "styles": [ + { + "alias": "", + "align": "left", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 
129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 3, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "percentunit" + } + ], + "targets": [ + { + "expr": "(\n (\n ( # violation ratio at the beginning of the dashboard chosen time range\n clamp_min(\n sum(\n increase(slo_domain_slo_class_slo_app_event_key:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\", result=\"fail\"}[$slo_time_range] offset ${__range_s}s)\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace, slo_app, event_key)\n / on(slo_domain, slo_version, slo_class) group_left()\n sum(\n increase(slo_domain_slo_class:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\"}[$slo_time_range] offset ${__range_s}s)\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace)\n , 0)\n )\n / on (slo_domain, slo_class, slo_version, slo_type) group_left ()\n (slo:violation_ratio_threshold - 1)\n # this gets us % (e.g. 
-0.05 for 5 percent) of error budget burnt by each individual app at the beginning of the time range\n ) * -1\n -\n (\n ( # violation ratio valid at the end of the dashboard chosen time range\n clamp_min(\n sum(\n increase(slo_domain_slo_class_slo_app_event_key:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\", result=\"fail\"}[$slo_time_range])\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace, slo_app, event_key)\n / on(slo_domain, slo_version, slo_class) group_left()\n sum(\n increase(slo_domain_slo_class:slo_events_total{slo_type=\"$slo_type\", slo_domain=\"$slo_domain\", slo_class=\"$slo_class\", namespace=\"$namespace\"}[$slo_time_range])\n ) by (slo_class, slo_domain, slo_version, slo_type, namespace)\n , 0)\n )\n / on (slo_domain, slo_class, slo_version, slo_type) group_left ()\n (slo:violation_ratio_threshold - 1)\n # this gets us % (e.g. -0.05 for 5 percent) of error budget burnt by each individual app at the end of the displayed time range\n ) * -1\n)\n", + "instant": true, + "interval": "", + "legendFormat": "{{ slo_app }}:{{ event_key }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "", + "transform": "timeseries_aggregations", + "type": "table-old" + } + ], + "refresh": "5s", + "schemaVersion": 25, + "style": "dark", + "tags": [ + "SLO", "SRE" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "query_result(slo:stable_version{enabled!=\"false\"})", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_version", + "options": [], + "query": "query_result(slo:stable_version{enabled!=\"false\"})", + "refresh": 2, + "regex": "/slo_version=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, +
"datasource": "Prometheus", + "definition": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\"}, slo_domain)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_domain", + "options": [], + "query": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\"}, slo_domain)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\"}, slo_class)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_class", + "options": [], + "query": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\"}, slo_class)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:burn_rate{slo_version=\"$slo_version\"}, slo_time_range)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_time_range", + "options": [], + "query": "label_values(slo:burn_rate{slo_version=\"$slo_version\"}, slo_time_range)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "Prometheus", + "definition": "label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\"}, slo_type)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "slo_type", + "options": [], + "query": 
"label_values(slo:violation_ratio_threshold{slo_version=\"$slo_version\", slo_domain=\"$slo_domain\"}, slo_type)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "Prometheus", + "definition": "query_result(slo:violation_ratio_threshold{slo_version=~\"$slo_version\", slo_domain=~\"$slo_domain\"})", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": "query_result(slo:violation_ratio_threshold{slo_version=~\"$slo_version\", slo_domain=~\"$slo_domain\"})", + "refresh": 2, + "regex": "/namespace=\"([^\"]+)\"/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "2020-07-02T18:09:25.797Z", + "to": "2020-07-02T18:13:57.967Z" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "SLO drilldown", + "uid": "n0JAZFGGk", + "version": 1 +} \ No newline at end of file diff --git a/examples/all_in_one/grafana/provisioning/dashboards/dashboard.yml b/examples/all_in_one/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 0000000..25e2e63 --- /dev/null +++ b/examples/all_in_one/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: +- name: 'Prometheus' + orgId: 1 + folder: 'SLO' + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/examples/all_in_one/grafana/provisioning/datasources/datasource.yml b/examples/all_in_one/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 
0000000..1ae3ebd --- /dev/null +++ b/examples/all_in_one/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,23 @@ +# config file version +apiVersion: 1 + +# list of datasources that should be deleted from the database +deleteDatasources: + - name: Prometheus + orgId: 1 + +# list of datasources to insert/update depending +# what's available in the database +datasources: + # name of the datasource. Required + - name: Prometheus + # datasource type. Required + type: prometheus + # access mode. proxy or direct (Server or Browser in the UI). Required + access: proxy + # org id. will default to orgId 1 if not specified + orgId: 1 + # custom UID which can be used to reference this datasource in other parts of the configuration, if not specified will be generated automatically + uid: my_unique_uid + # url + url: http://prometheus:9090 diff --git a/examples/all_in_one/nginx/conf/nginx.conf b/examples/all_in_one/nginx/conf/nginx.conf new file mode 100644 index 0000000..a533537 --- /dev/null +++ b/examples/all_in_one/nginx/conf/nginx.conf @@ -0,0 +1,37 @@ +events { + worker_connections 1024; +} + +http { + server_tokens off; + include mime.types; + charset utf-8; + + log_format upstream_time '$remote_addr - $remote_user [$time_local] ' + '"$request" $status $body_bytes_sent ' + '"$http_referer" "$http_user_agent" ' + 'rt=$request_time uct="$upstream_connect_time" uht="$upstream_header_time" urt="$upstream_response_time"'; + + access_log /nginx/logs/access_log upstream_time; + + limit_req_zone $binary_remote_addr zone=one:10m rate=1r/m; + + server { + server_name localhost; + listen 0.0.0.0:8080; + + set $content_class static; + location / { + return 200; + } + + location /err { + return 500; + } + + location /drop { + # delay incoming requests so that the client will timeout + limit_req zone=one burst=5; + } + } +} diff --git a/examples/all_in_one/prometheus/recording_rules b/examples/all_in_one/prometheus/recording_rules new file mode 120000 index 0000000..e901ca2 
--- /dev/null +++ b/examples/all_in_one/prometheus/recording_rules @@ -0,0 +1 @@ +../../../prometheus_rules/ \ No newline at end of file diff --git a/examples/all_in_one/slo-exporter/conf/classification.csv b/examples/all_in_one/slo-exporter/conf/classification.csv new file mode 100644 index 0000000..b1abcf4 --- /dev/null +++ b/examples/all_in_one/slo-exporter/conf/classification.csv @@ -0,0 +1 @@ +example-domain,example-app,critical,"^(GET|POST|HEAD|PUT|DELETE):.*" diff --git a/examples/all_in_one/slo-exporter/conf/slo_exporter.yaml b/examples/all_in_one/slo-exporter/conf/slo_exporter.yaml new file mode 100644 index 0000000..768a9d3 --- /dev/null +++ b/examples/all_in_one/slo-exporter/conf/slo_exporter.yaml @@ -0,0 +1,48 @@ +webServerListenAddress: "0.0.0.0:8001" +maximumGracefulShutdownDuration: "10s" +afterPipelineShutdownDelay: "1s" + +pipeline: ["tailer", "relabel", "eventKeyGenerator", "dynamicClassifier", "sloEventProducer", "prometheusExporter"] + +modules: + + tailer: + tailedFile: "/logs/access_log" + follow: true + reopen: true + positionFile: "" + positionPersistenceInterval: "2s" + loglineParseRegexp: '^(?P[A-Fa-f0-9.:]{4,50}) - \S+ \[(?P