diff --git a/kubernetes/core/kube-prometheus-stack.yml b/kubernetes/core/kube-prometheus-stack.yml
new file mode 100644
index 00000000..ec4c0ae3
--- /dev/null
+++ b/kubernetes/core/kube-prometheus-stack.yml
@@ -0,0 +1,131 @@
+---
+# Secret that grafana.admin.existingSecret below points at for the admin user/password keys.
+apiVersion: v1
+kind: Secret
+metadata:
+  name: prometheus
+  namespace: kube-system
+type: Opaque
+# stringData:
+#   grafana-admin-user: "someuser"
+#   grafana-admin-password: "somepass"
+
+---
+apiVersion: helm.cattle.io/v1
+kind: HelmChart
+metadata:
+  name: kube-prometheus-stack
+  namespace: kube-system
+spec:
+  chart: kube-prometheus-stack
+  repo: https://prometheus-community.github.io/helm-charts
+  version: 65.8.0
+  targetNamespace: kube-system
+  valuesContent: |-
+    defaultRules:
+      rules:
+        windows: false
+    grafana:
+      admin:
+        existingSecret: "prometheus"
+        userKey: grafana-admin-user
+        passwordKey: grafana-admin-password
+      ingress:
+        enabled: true
+        hosts:
+          - grafana.${DOMAIN_NAME}
+        tls:
+          - secretName: ${DOMAIN_NAME}-tls
+            hosts:
+              - grafana.${DOMAIN_NAME}
+      persistence:
+        enabled: true
+        storageClassName: "local-path-retain"
+        accessModes:
+          - ReadWriteOnce
+        size: 20Gi
+        finalizers:
+          - kubernetes.io/pvc-protection
+      affinity:
+        nodeAffinity:
+          # Schedule onto amd64 to specifically avoid raspberry pi to not wear the SD card
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: "kubernetes.io/arch"
+                    operator: In
+                    values:
+                      - amd64
+    # kube-prometheus-stack reads these settings from prometheus.prometheusSpec.
+    prometheus:
+      prometheusSpec:
+        retention: 365d
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              storageClassName: "local-path-retain"
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 50Gi
+        affinity:
+          nodeAffinity:
+            # Schedule onto amd64 to specifically avoid raspberry pi to not wear the SD card
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+                - matchExpressions:
+                    - key: "kubernetes.io/arch"
+                      operator: In
+                      values:
+                        - amd64
+    prometheus-windows-exporter:
+      prometheus:
+        monitor:
+          enabled: false
+    # https://github.com/prometheus-community/helm-charts/issues/2865#issuecomment-1490318965
+    kubelet:
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+)"
+            action: keep
+          - sourceLabels: ["node"]
+            targetLabel: instance
+            action: replace
+    kubeApiServer:
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+)"
+            action: keep
+          # Remove high cardinality metrics
+          - sourceLabels: ["__name__"]
+            regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
+            action: drop
+          - sourceLabels: ["__name__"]
+            regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
+            action: drop
+    kubeControllerManager:
+      # The NODES_ENDPOINTS placeholder is replaced by envsubst with the JSON array from the "Fetch list of nodes" task.
+      endpoints: $NODES_ENDPOINTS
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|attachdetach_controller|authenticated_user|authentication|cronjob_controller|disabled_metric|endpoint_slice|ephemeral_volume|garbagecollector_controller|get_token|go|hidden_metric|job_controller|kubernetes_build|kubernetes_feature|leader_election|node_collector|node_ipam|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|pv_collector|registered_metric|replicaset_controller|rest_client|retroactive_storageclass|root_ca|running_managed|scrape_duration|scrape_samples|scrape_series|service_controller|storage_count|storage_operation|ttl_after|volume_operation|workqueue)_(.+)"
+            action: keep
+    kubeEtcd:
+      endpoints: $NODES_ENDPOINTS
+    kubeProxy:
+      enabled: false  # Disabled because eBPF
+      endpoints: $NODES_ENDPOINTS
+    kubeScheduler:
+      endpoints: $NODES_ENDPOINTS
+      serviceMonitor:
+        metricRelabelings:
+          # Remove duplicate metrics
+          - sourceLabels: ["__name__"]
+            regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authenticated_user|authentication|disabled_metric|go|hidden_metric|kubernetes_build|kubernetes_feature|leader_election|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scheduler|scrape_duration|scrape_samples|scrape_series|workqueue)_(.+)"
+            action: keep
diff --git a/kubernetes/services/homer.yml b/kubernetes/services/homer.yml
index 055ded65..56c42cf2 100644
--- a/kubernetes/services/homer.yml
+++ b/kubernetes/services/homer.yml
@@ -138,6 +138,9 @@ data:
       - name: "Monitoring"
         icon: "fas fa-heartbeat"
         items:
+          - name: "Grafana"
+            logo: "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a1/Grafana_logo.svg/1024px-Grafana_logo.svg.png"
+            url: "https://grafana.${DOMAIN_NAME}"
           - name: "Speedtest Tracker"
             logo: "https://raw.githubusercontent.com/henrywhitaker3/Speedtest-Tracker/8cb2e8a3236850b4a07e887ac376c0d4d5e804f4/public/icons/fav/ms-icon-310x310.png"
             url: "https://speedtest.${DOMAIN_NAME}"
diff --git a/roles/kubernetes_apps/tasks/apply_folder_manifests.yml b/roles/kubernetes_apps/tasks/apply_folder_manifests.yml
index c3674390..89e06a05 100644
--- a/roles/kubernetes_apps/tasks/apply_folder_manifests.yml
+++ b/roles/kubernetes_apps/tasks/apply_folder_manifests.yml
@@ -10,7 +10,7 @@
     manifests_folder: "{{ kubernetes_project_location }}/{{ folder_name }}"
 
 - name: Generate aggregate manifest file
-  ansible.builtin.shell: "set -o pipefail && cat {{ manifests_folder }}/*.yml | env DOMAIN_NAME={{ fqdn }} envsubst '$DOMAIN_NAME' > {{ aggregate_path }}"
+  ansible.builtin.shell: "set -o pipefail && cat {{ manifests_folder }}/*.yml | env DOMAIN_NAME={{ fqdn | quote }} NODES_ENDPOINTS={{ k3s_nodes_endpoints.stdout | quote }} envsubst '$DOMAIN_NAME $NODES_ENDPOINTS' > {{ aggregate_path }}"
   args:
     executable: /usr/bin/bash
   delegate_to: localhost
diff --git a/roles/kubernetes_apps/tasks/main.yml b/roles/kubernetes_apps/tasks/main.yml
index 4966c6bf..73b7abd0 100644
--- a/roles/kubernetes_apps/tasks/main.yml
+++ b/roles/kubernetes_apps/tasks/main.yml
@@ -11,6 +11,21 @@
     - { to_port: 30778, proto: any, rule: allow, comment: "Allow Minecraft" }
   when: not (skip_firewall_setup or manifest_only_setup)
 
+# Collect each node's k3s.io/external-ip annotation as a sorted, compact JSON array.
+# NOTE(review): with run_once, `when` is evaluated against the first host only -
+# confirm that host is a control node, otherwise this task is skipped for every host.
+- name: Fetch list of nodes
+  ansible.builtin.shell: "set -o pipefail && kubectl get nodes -o json | jq -c '[.items[].metadata.annotations.\"k3s.io/external-ip\"] | sort'"
+  args:
+    executable: /usr/bin/bash
+  when:
+    - k3s_control_node is defined
+    - k3s_control_node
+  run_once: true
+  changed_when: false
+  become: true
+  register: k3s_nodes_endpoints
+
 - name: Deploy manifests
   when:
     - k3s_control_node is defined
diff --git a/roles/system_setup/tasks/packages.yml b/roles/system_setup/tasks/packages.yml
index dcad6971..81aa009d 100644
--- a/roles/system_setup/tasks/packages.yml
+++ b/roles/system_setup/tasks/packages.yml
@@ -11,6 +11,7 @@
       - avahi-daemon
       - curl
       - ddclient
+      - jq
       - rsync
       - smartmontools
       - ufw