diff --git a/kubernetes/core/templates/kube-prometheus-stack.yml b/kubernetes/core/templates/kube-prometheus-stack.yml
new file mode 100644
index 0000000..2cd9fc2
--- /dev/null
+++ b/kubernetes/core/templates/kube-prometheus-stack.yml
@@ -0,0 +1,133 @@
+---
+# Grafana admin credentials read through grafana.admin.existingSecret below.
+# No values are committed here; supply the two keys shown in the template.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: prometheus
+  namespace: kube-system
+type: Opaque
+# stringData:
+#   grafana-admin-user: "someuser"
+#   grafana-admin-password: "somepass"
+
+---
+# HelmChart (helm.cattle.io) resource installing kube-prometheus-stack into kube-system.
+apiVersion: helm.cattle.io/v1
+kind: HelmChart
+metadata:
+  name: kube-prometheus-stack
+  namespace: kube-system
+spec:
+  chart: kube-prometheus-stack
+  repo: https://prometheus-community.github.io/helm-charts
+  version: 65.8.0
+  targetNamespace: kube-system
+  valuesContent: |-
+    defaultRules:
+      rules:
+        windows: false
+    grafana:
+      admin:
+        existingSecret: "prometheus"
+        userKey: grafana-admin-user
+        passwordKey: grafana-admin-password
+      ingress:
+        enabled: true
+        hosts:
+          - grafana.{{ .Values.fqdn }}
+        tls:
+          - secretName: {{ .Values.fqdn }}-tls
+            hosts:
+              - grafana.{{ .Values.fqdn }}
+      persistence:
+        enabled: true
+        storageClassName: "local-path-retain"
+        accessModes:
+          - ReadWriteOnce
+        size: 20Gi
+        finalizers:
+          - kubernetes.io/pvc-protection
+      affinity:
+        nodeAffinity:
+          # Schedule onto amd64 to specifically avoid raspberry pi to not wear the SD card
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: "kubernetes.io/arch"
+                    operator: In
+                    values:
+                      - amd64
+    # The chart only reads prometheusSpec when it is nested under `prometheus`.
+    prometheus:
+      prometheusSpec:
+        retention: 365d
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: "local-path-retain"
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 50Gi
+        affinity:
+          nodeAffinity:
+            # Schedule onto amd64 to specifically avoid raspberry pi to not wear the SD card
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+                - matchExpressions:
+                    - key: "kubernetes.io/arch"
+                      operator: In
+                      values:
+                        - amd64
+    prometheus-windows-exporter:
+      prometheus:
+        monitor:
+          enabled: false
+    # https://github.com/prometheus-community/helm-charts/issues/2865#issuecomment-1490318965
+    kubelet:
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
+            action: keep
+          - sourceLabels: ["node"]
+            targetLabel: instance
+            action: replace
+    kubeApiServer:
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
+            action: keep
+          # Remove high cardinality metrics
+          - sourceLabels: ["__name__"]
+            regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
+            action: drop
+          - sourceLabels: ["__name__"]
+            regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
+            action: drop
+    # nodes_endpoints is a list; toJson renders it as a YAML flow sequence.
+    kubeControllerManager:
+      endpoints: {{ .Values.nodes_endpoints | toJson }}
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|attachdetach_controller|authenticated_user|authentication|cronjob_controller|disabled_metric|endpoint_slice|ephemeral_volume|garbagecollector_controller|get_token|go|hidden_metric|job_controller|kubernetes_build|kubernetes_feature|leader_election|node_collector|node_ipam|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|pv_collector|registered_metric|replicaset_controller|rest_client|retroactive_storageclass|root_ca|running_managed|scrape_duration|scrape_samples|scrape_series|service_controller|storage_count|storage_operation|ttl_after|volume_operation|workqueue)_(.+)"
+            action: keep
+    kubeEtcd:
+      endpoints: {{ .Values.nodes_endpoints | toJson }}
+    kubeProxy:
+      enabled: false  # Disabled because eBPF
+      endpoints: {{ .Values.nodes_endpoints | toJson }}
+    kubeScheduler:
+      endpoints: {{ .Values.nodes_endpoints | toJson }}
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authenticated_user|authentication|disabled_metric|go|hidden_metric|kubernetes_build|kubernetes_feature|leader_election|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scheduler|scrape_duration|scrape_samples|scrape_series|workqueue)_(.+)"
+            action: keep
diff --git a/kubernetes/core/values.yaml b/kubernetes/core/values.yaml
index 5c20108..ae46c2a 100644
--- a/kubernetes/core/values.yaml
+++ b/kubernetes/core/values.yaml
@@ -4,3 +4,7 @@
 
 # Base domain name to be used for all services
 fqdn: example.org
+
+# External IPs of the Kubernetes cluster's nodes, as a list of strings.
+# Rendered with toJson into the control-plane `endpoints` chart values.
+nodes_endpoints: []
diff --git a/kubernetes/services/templates/homer.yml b/kubernetes/services/templates/homer.yml
index cdd8b7d..d646331 100644
--- a/kubernetes/services/templates/homer.yml
+++ b/kubernetes/services/templates/homer.yml
@@ -145,9 +145,9 @@ data:
       - name: "Monitoring"
         icon: "fas fa-heartbeat"
         items:
-          - name: "Netdata"
-            logo: "https://raw.githubusercontent.com/netdata/netdata/483d8481a5a5edf72630068534feb1a4d228ed0b/web/gui/v1/images/favicon-196x196.png"
-            url: "https://netdata.{{ .Values.fqdn }}"
+          - name: "Grafana"
+            logo: "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a1/Grafana_logo.svg/1024px-Grafana_logo.svg.png"
+            url: "https://grafana.{{ .Values.fqdn }}"
           - name: "Speedtest Tracker"
             logo: "https://raw.githubusercontent.com/henrywhitaker3/Speedtest-Tracker/8cb2e8a3236850b4a07e887ac376c0d4d5e804f4/public/icons/fav/ms-icon-310x310.png"
             url: "https://speedtest.{{ .Values.fqdn }}"
diff --git a/roles/kubernetes_apps/tasks/apply_folder_manifests.yml b/roles/kubernetes_apps/tasks/apply_folder_manifests.yml
index 0ae5a8f..8923695 100644
--- a/roles/kubernetes_apps/tasks/apply_folder_manifests.yml
+++ b/roles/kubernetes_apps/tasks/apply_folder_manifests.yml
@@ -14,6 +14,9 @@
     chart_ref: "{{ chart_folder }}"
     set_values:
       - value: "fqdn={{ fqdn }}"
+      # JSON array produced by jq; the json value type keeps its commas intact.
+      - value: "nodes_endpoints={{ k3s_nodes_endpoints.stdout }}"
+        value_type: json
   delegate_to: localhost
   register: helm_template
 
diff --git a/roles/kubernetes_apps/tasks/main.yml b/roles/kubernetes_apps/tasks/main.yml
index 60c633d..dfba8ff 100644
--- a/roles/kubernetes_apps/tasks/main.yml
+++ b/roles/kubernetes_apps/tasks/main.yml
@@ -13,6 +13,20 @@
     - { to_port: 30778, proto: any, rule: allow, comment: "Allow Minecraft Bedrock" }
   when: not (skip_firewall_setup or manifest_only_setup)
 
+# Collect each node's `k3s.io/external-ip` annotation as a sorted, compact JSON array.
+# Deliberately not `run_once`: its `when` would be evaluated against the first host
+# only, leaving the variable without stdout if that host is not a control node.
+- name: Fetch list of nodes
+  ansible.builtin.shell: "set -o pipefail && kubectl get nodes -o json | jq -c '[.items[].metadata.annotations.\"k3s.io/external-ip\" | select(. != null)] | sort'"
+  args:
+    executable: /usr/bin/bash
+  when:
+    - k3s_control_node is defined
+    - k3s_control_node
+  changed_when: false
+  become: true
+  register: k3s_nodes_endpoints
+
 - name: Deploy manifests
   when:
     - k3s_control_node is defined
diff --git a/roles/system_setup/tasks/packages.yml b/roles/system_setup/tasks/packages.yml
index 57546be..04eba28 100644
--- a/roles/system_setup/tasks/packages.yml
+++ b/roles/system_setup/tasks/packages.yml
@@ -10,6 +10,7 @@
     pkg:
       - curl
       - ddclient
+      - jq
       - rasdaemon
       - rsync
       - smartmontools