Skip to content

Commit

Permalink
Add prometheus stack
Browse files Browse the repository at this point in the history
Closes #16
  • Loading branch information
abelfodil committed Nov 7, 2024
1 parent 1d96c00 commit f25a8cb
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 1 deletion.
127 changes: 127 additions & 0 deletions kubernetes/core/kube-prometheus-stack.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
---
# Secret holding the Grafana admin credentials. The kube-prometheus-stack
# HelmChart in this file points grafana.admin.existingSecret at it and reads
# the keys `grafana-admin-user` and `grafana-admin-password`.
apiVersion: v1
kind: Secret
metadata:
  name: prometheus
  namespace: kube-system
type: Opaque
# No credentials are committed here. Uncomment and fill in the block below
# (or populate the same keys out-of-band) before deploying.
# stringData:
#   grafana-admin-user: "someuser"
#   grafana-admin-password: "somepass"

---
# k3s HelmChart resource installing kube-prometheus-stack into kube-system.
#
# `${DOMAIN_NAME}` and `$NODES_ENDPOINTS` are placeholders replaced by envsubst
# when the aggregate manifest is generated; `$NODES_ENDPOINTS` is rendered as a
# compact JSON array, which is also a valid YAML flow sequence.
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: kube-prometheus-stack
  namespace: kube-system
spec:
  chart: kube-prometheus-stack
  repo: https://prometheus-community.github.io/helm-charts
  version: "65.8.0"
  targetNamespace: kube-system
  valuesContent: |-
    defaultRules:
      rules:
        windows: false
    grafana:
      admin:
        # Credentials come from the `prometheus` Secret defined in this file.
        existingSecret: "prometheus"
        userKey: grafana-admin-user
        passwordKey: grafana-admin-password
      ingress:
        enabled: true
        hosts:
          - grafana.${DOMAIN_NAME}
        tls:
          - secretName: ${DOMAIN_NAME}-tls
            hosts:
              - grafana.${DOMAIN_NAME}
      persistence:
        enabled: true
        storageClassName: "local-path-retain"
        accessModes:
          - ReadWriteOnce
        size: 20Gi
        finalizers:
          - kubernetes.io/pvc-protection
      affinity:
        nodeAffinity:
          # Pin to amd64 nodes so the Raspberry Pi nodes are never selected,
          # which avoids wearing out their SD cards.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "kubernetes.io/arch"
                    operator: In
                    values:
                      - amd64
    # The chart reads the Prometheus server settings from
    # `prometheus.prometheusSpec`; without the enclosing `prometheus:` key the
    # retention, storage and affinity below would be silently ignored.
    prometheus:
      prometheusSpec:
        # NOTE(review): no retentionSize is set; confirm that the 50Gi volume
        # can hold 365 days of samples.
        retention: 365d
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: "local-path-retain"
              accessModes: ["ReadWriteOnce"]
              resources:
                requests:
                  storage: 50Gi
        affinity:
          nodeAffinity:
            # Pin to amd64 nodes so the Raspberry Pi nodes are never selected,
            # which avoids wearing out their SD cards.
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: "kubernetes.io/arch"
                      operator: In
                      values:
                        - amd64
    prometheus-windows-exporter:
      prometheus:
        monitor:
          enabled: false
    # The per-component allowlists below follow
    # https://github.com/prometheus-community/helm-charts/issues/2865#issuecomment-1490318965
    kubelet:
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: keep only the metric-name prefixes listed here.
          - sourceLabels: ["__name__"]
            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
            action: keep
          # Copy the `node` label into `instance`.
          - sourceLabels: ["node"]
            targetLabel: instance
            action: replace
    kubeApiServer:
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: keep only the metric-name prefixes listed here.
          - sourceLabels: ["__name__"]
            regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
            action: keep
          # Remove high cardinality metrics (histogram buckets).
          - sourceLabels: ["__name__"]
            regex: "(apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket"
            action: drop
          - sourceLabels: ["__name__"]
            regex: "(apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)"
            action: drop
    kubeControllerManager:
      endpoints: $NODES_ENDPOINTS
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: keep only the metric-name prefixes listed here.
          - sourceLabels: ["__name__"]
            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|attachdetach_controller|authenticated_user|authentication|cronjob_controller|disabled_metric|endpoint_slice|ephemeral_volume|garbagecollector_controller|get_token|go|hidden_metric|job_controller|kubernetes_build|kubernetes_feature|leader_election|node_collector|node_ipam|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|pv_collector|registered_metric|replicaset_controller|rest_client|retroactive_storageclass|root_ca|running_managed|scrape_duration|scrape_samples|scrape_series|service_controller|storage_count|storage_operation|ttl_after|volume_operation|workqueue)_(.+)"
            action: keep
    kubeEtcd:
      endpoints: $NODES_ENDPOINTS
    kubeProxy:
      enabled: false  # Disabled because eBPF
      endpoints: $NODES_ENDPOINTS
    kubeScheduler:
      endpoints: $NODES_ENDPOINTS
      serviceMonitor:
        metricRelabelings:
          # Remove duplicate metrics: keep only the metric-name prefixes listed here.
          - sourceLabels: ["__name__"]
            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authenticated_user|authentication|disabled_metric|go|hidden_metric|kubernetes_build|kubernetes_feature|leader_election|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scheduler|scrape_duration|scrape_samples|scrape_series|workqueue)_(.+)"
            action: keep
3 changes: 3 additions & 0 deletions kubernetes/services/homer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ data:
- name: "Monitoring"
icon: "fas fa-heartbeat"
items:
- name: "Grafana"
logo: "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a1/Grafana_logo.svg/1024px-Grafana_logo.svg.png"
url: "https://grafana.${DOMAIN_NAME}"
- name: "Speedtest Tracker"
logo: "https://raw.githubusercontent.com/henrywhitaker3/Speedtest-Tracker/8cb2e8a3236850b4a07e887ac376c0d4d5e804f4/public/icons/fav/ms-icon-310x310.png"
url: "https://speedtest.${DOMAIN_NAME}"
Expand Down
2 changes: 1 addition & 1 deletion roles/kubernetes_apps/tasks/apply_folder_manifests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
manifests_folder: "{{ kubernetes_project_location }}/{{ folder_name }}"

- name: Generate aggregate manifest file
ansible.builtin.shell: "set -o pipefail && cat {{ manifests_folder }}/*.yml | env DOMAIN_NAME={{ fqdn }} envsubst '$DOMAIN_NAME' > {{ aggregate_path }}"
ansible.builtin.shell: "set -o pipefail && cat {{ manifests_folder }}/*.yml | env DOMAIN_NAME={{ fqdn }} NODES_ENDPOINTS='{{ k3s_nodes_endpoints.stdout }}' envsubst '$DOMAIN_NAME $NODES_ENDPOINTS' > {{ aggregate_path }}"
args:
executable: /usr/bin/bash
delegate_to: localhost
Expand Down
12 changes: 12 additions & 0 deletions roles/kubernetes_apps/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@
- { to_port: 30778, proto: any, rule: allow, comment: "Allow Minecraft" }
when: not (skip_firewall_setup or manifest_only_setup)

# Collects the `k3s.io/external-ip` annotation of every node as a sorted,
# compact JSON array and registers the command result as `k3s_nodes_endpoints`
# (the array itself is in `.stdout`).
- name: Fetch list of nodes
  # Nodes without the annotation would contribute `null` entries, so they are
  # filtered out before sorting.
  ansible.builtin.shell: >-
    set -o pipefail &&
    kubectl get nodes -o json |
    jq -c '[.items[].metadata.annotations."k3s.io/external-ip" | select(. != null)] | sort'
  args:
    # `set -o pipefail` needs bash rather than the default /bin/sh.
    executable: /usr/bin/bash
  # NOTE(review): combined with run_once, these conditions are presumably
  # evaluated against the first host of the play only; confirm that host is the
  # control node, otherwise the task is skipped and nothing is registered.
  when:
    - k3s_control_node is defined
    - k3s_control_node
  run_once: true
  # Read-only query: never report a change.
  changed_when: false
  become: true
  register: k3s_nodes_endpoints

- name: Deploy manifests
when:
- k3s_control_node is defined
Expand Down
1 change: 1 addition & 0 deletions roles/system_setup/tasks/packages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- avahi-daemon
- curl
- ddclient
- jq
- rsync
- smartmontools
- ufw
Expand Down

0 comments on commit f25a8cb

Please sign in to comment.