diff --git a/charts/datadog/CHANGELOG.md b/charts/datadog/CHANGELOG.md
index 0f2ba4896..0d9eb781f 100644
--- a/charts/datadog/CHANGELOG.md
+++ b/charts/datadog/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Datadog changelog
 
+## 3.91.0
+
+* Add support for GPU monitoring
+
 ## 3.90.5
 
 * Update `fips.image.tag` to `1.1.7` updating openSSL version to 3.0.16
diff --git a/charts/datadog/Chart.yaml b/charts/datadog/Chart.yaml
index ffcc41bfa..48bd90e86 100644
--- a/charts/datadog/Chart.yaml
+++ b/charts/datadog/Chart.yaml
@@ -1,7 +1,7 @@
 ---
 apiVersion: v1
 name: datadog
-version: 3.90.5
+version: 3.91.0
 appVersion: "7"
 description: Datadog Agent
 keywords:
diff --git a/charts/datadog/README.md b/charts/datadog/README.md
index b7be655be..eec89862a 100644
--- a/charts/datadog/README.md
+++ b/charts/datadog/README.md
@@ -1,6 +1,6 @@
 # Datadog
 
-![Version: 3.90.5](https://img.shields.io/badge/Version-3.90.5-informational?style=flat-square) ![AppVersion: 7](https://img.shields.io/badge/AppVersion-7-informational?style=flat-square)
+![Version: 3.91.0](https://img.shields.io/badge/Version-3.91.0-informational?style=flat-square) ![AppVersion: 7](https://img.shields.io/badge/AppVersion-7-informational?style=flat-square)
 
 [Datadog](https://www.datadoghq.com/) is a hosted infrastructure monitoring platform. This chart adds the Datadog Agent to all nodes in your cluster via a DaemonSet. It also optionally depends on the [kube-state-metrics chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics). For more information about monitoring Kubernetes with Datadog, please refer to the [Datadog documentation website](https://docs.datadoghq.com/agent/basic_agent_usage/kubernetes/).
 
@@ -749,6 +749,9 @@ helm install \
 | datadog.envFrom | list | `[]` | Set environment variables for all Agents directly from configMaps and/or secrets |
 | datadog.excludePauseContainer | bool | `true` | Exclude pause containers from Agent Autodiscovery. |
 | datadog.expvarPort | int | `6000` | Specify the port to expose pprof and expvar to not interfere with the agent metrics port from the cluster-agent, which defaults to 5000 |
+| datadog.gpuMonitoring.configureCgroupPerms | bool | `false` | Configure cgroup permissions for GPU monitoring |
+| datadog.gpuMonitoring.enabled | bool | `false` | Enable GPU monitoring |
+| datadog.gpuMonitoring.runtimeClassName | string | `"nvidia"` | Runtime class name for the agent pods to get access to NVIDIA resources. Can be left empty to use the default runtime class. |
 | datadog.helmCheck.collectEvents | bool | `false` | Set this to true to enable event collection in the Helm Check (Requires Agent 7.36.0+ and Cluster Agent 1.20.0+) This requires datadog.HelmCheck.enabled to be set to true |
 | datadog.helmCheck.enabled | bool | `false` | Set this to true to enable the Helm check (Requires Agent 7.35.0+ and Cluster Agent 1.19.0+) This requires clusterAgent.enabled to be set to true |
 | datadog.helmCheck.valuesAsTags | object | `{}` | Collects Helm values from a release and uses them as tags (Requires Agent and Cluster Agent 7.40.0+). This requires datadog.HelmCheck.enabled to be set to true |
diff --git a/charts/datadog/templates/_container-system-probe.yaml b/charts/datadog/templates/_container-system-probe.yaml
index 6e3127392..8b6669b18 100644
--- a/charts/datadog/templates/_container-system-probe.yaml
+++ b/charts/datadog/templates/_container-system-probe.yaml
@@ -21,7 +21,7 @@
     {{- include "containers-common-env" . | nindent 4 }}
     - name: DD_LOG_LEVEL
       value: {{ .Values.agents.containers.systemProbe.logLevel | default .Values.datadog.logLevel | quote }}
-    {{- if .Values.datadog.serviceMonitoring.enabled }}
+    {{- if or .Values.datadog.serviceMonitoring.enabled .Values.datadog.gpuMonitoring.enabled }}
     - name: HOST_ROOT
       value: "/host/root"
     {{- end }}
@@ -70,14 +70,14 @@
       mountPath: /host/proc
       mountPropagation: {{ .Values.datadog.hostVolumeMountPropagation }}
       readOnly: true
-{{- if or .Values.datadog.serviceMonitoring.enabled .Values.datadog.networkMonitoring.enabled .Values.datadog.discovery.enabled }}
+{{- if or .Values.datadog.serviceMonitoring.enabled .Values.datadog.networkMonitoring.enabled .Values.datadog.discovery.enabled .Values.datadog.gpuMonitoring.enabled }}
     - name: cgroups
       mountPath: /host/sys/fs/cgroup
       mountPropagation: {{ .Values.datadog.hostVolumeMountPropagation }}
       readOnly: true
 {{- end }}
     {{- include "linux-container-host-release-volumemounts" . | nindent 4 }}
-    {{- if (eq (include "should-add-host-path-for-os-release-paths" .) "true") }} 
+    {{- if (eq (include "should-add-host-path-for-os-release-paths" .) "true") }}
     {{- if ne .Values.datadog.osReleasePath "/etc/redhat-release" }}
     - name: etc-redhat-release
       mountPath: /host/etc/redhat-release
@@ -94,12 +94,16 @@
       readOnly: true
     {{- end }}
     {{- end }}
-{{- if .Values.datadog.serviceMonitoring.enabled }}
+{{- if or .Values.datadog.serviceMonitoring.enabled .Values.datadog.gpuMonitoring.enabled }}
     - name: hostroot
       mountPath: /host/root
       mountPropagation: {{ .Values.datadog.hostVolumeMountPropagation }}
       readOnly: true
 {{- end }}
+{{- if .Values.datadog.gpuMonitoring.enabled }}
+    - name: gpu-devices
+      mountPath: /var/run/nvidia-container-devices/all
+{{- end }}
 {{- if and (eq (include "runtime-compilation-enabled" .) "true") .Values.datadog.systemProbe.enableDefaultKernelHeadersPaths }}
     - name: modules
       mountPath: /lib/modules
diff --git a/charts/datadog/templates/_daemonset-volumes-linux.yaml b/charts/datadog/templates/_daemonset-volumes-linux.yaml
index 136e2c6af..c4238986c 100644
--- a/charts/datadog/templates/_daemonset-volumes-linux.yaml
+++ b/charts/datadog/templates/_daemonset-volumes-linux.yaml
@@ -148,7 +148,7 @@
     path: /etc/passwd
   name: passwd
 {{- end }}
-{{- if or (and (eq (include "should-enable-system-probe" .) "true") .Values.datadog.serviceMonitoring.enabled) (and (eq (include "should-enable-security-agent" .) "true") .Values.datadog.securityAgent.compliance.enabled) }}
+{{- if or (and (eq (include "should-enable-system-probe" .) "true") (or .Values.datadog.serviceMonitoring.enabled .Values.datadog.gpuMonitoring.enabled)) (and (eq (include "should-enable-security-agent" .) "true") .Values.datadog.securityAgent.compliance.enabled) }}
 - hostPath:
     path: /
   name: hostroot
@@ -219,4 +219,9 @@
     secretName: datadog-kubelet-cert
   name: kubelet-cert-volume
 {{- end }}
+{{- if .Values.datadog.gpuMonitoring.enabled }}
+- name: gpu-devices
+  hostPath:
+    path: /dev/null
+{{- end }}
 {{- end -}}
diff --git a/charts/datadog/templates/_helpers.tpl b/charts/datadog/templates/_helpers.tpl
index 7d07df3cd..59edaf668 100644
--- a/charts/datadog/templates/_helpers.tpl
+++ b/charts/datadog/templates/_helpers.tpl
@@ -329,7 +329,7 @@ Return a remote image path based on `.Values` (passed as root) and `.` (any `.im
 Return true if a system-probe feature is enabled.
 */}}
 {{- define "system-probe-feature" -}}
-{{- if or .Values.datadog.securityAgent.runtime.enabled .Values.datadog.securityAgent.runtime.fimEnabled .Values.datadog.networkMonitoring.enabled .Values.datadog.systemProbe.enableTCPQueueLength .Values.datadog.systemProbe.enableOOMKill .Values.datadog.serviceMonitoring.enabled .Values.datadog.discovery.enabled -}}
+{{- if or .Values.datadog.securityAgent.runtime.enabled .Values.datadog.securityAgent.runtime.fimEnabled .Values.datadog.networkMonitoring.enabled .Values.datadog.systemProbe.enableTCPQueueLength .Values.datadog.systemProbe.enableOOMKill .Values.datadog.serviceMonitoring.enabled .Values.datadog.discovery.enabled .Values.datadog.gpuMonitoring.enabled -}}
 true
 {{- else -}}
 false
diff --git a/charts/datadog/templates/daemonset.yaml b/charts/datadog/templates/daemonset.yaml
index 45dc64663..3eb021cba 100644
--- a/charts/datadog/templates/daemonset.yaml
+++ b/charts/datadog/templates/daemonset.yaml
@@ -114,6 +114,9 @@ spec:
       {{- if or .Values.agents.priorityClassCreate .Values.agents.priorityClassName }}
       priorityClassName: {{ .Values.agents.priorityClassName | default (include "datadog.fullname" . ) }}
       {{- end }}
+      {{- if and .Values.datadog.gpuMonitoring.enabled .Values.datadog.gpuMonitoring.runtimeClassName }}
+      runtimeClassName: {{ .Values.datadog.gpuMonitoring.runtimeClassName | quote }}
+      {{- end }}
       containers:
         {{- include "container-agent" . | nindent 6 }}
         {{- if eq (include "should-enable-trace-agent" .) "true" }}
diff --git a/charts/datadog/templates/system-probe-configmap.yaml b/charts/datadog/templates/system-probe-configmap.yaml
index d769235d2..e74f96f23 100644
--- a/charts/datadog/templates/system-probe-configmap.yaml
+++ b/charts/datadog/templates/system-probe-configmap.yaml
@@ -47,6 +47,9 @@ data:
     discovery:
       enabled: {{ $.Values.datadog.discovery.enabled }}
     {{- end }}
+    gpu_monitoring:
+      enabled: {{ $.Values.datadog.gpuMonitoring.enabled }}
+      configure_cgroup_perms: {{ $.Values.datadog.gpuMonitoring.configureCgroupPerms }}
     runtime_security_config:
       enabled: {{ $.Values.datadog.securityAgent.runtime.enabled }}
       fim_enabled: {{ $.Values.datadog.securityAgent.runtime.fimEnabled }}
diff --git a/charts/datadog/values.yaml b/charts/datadog/values.yaml
index 858702591..126c59758 100644
--- a/charts/datadog/values.yaml
+++ b/charts/datadog/values.yaml
@@ -835,6 +835,17 @@ datadog:
     # datadog.discovery.enabled -- (bool) Enable Service Discovery
     enabled:  # false
 
+  gpuMonitoring:
+    # datadog.gpuMonitoring.enabled -- Enable GPU monitoring
+    enabled: false
+
+    # datadog.gpuMonitoring.configureCgroupPerms -- Configure cgroup permissions for GPU monitoring
+    configureCgroupPerms: false
+
+    # datadog.gpuMonitoring.runtimeClassName -- Runtime class name for the agent pods to get access to NVIDIA resources. Can be left empty to use the default runtime class.
+    runtimeClassName: "nvidia"
+
+
   # Software Bill of Materials configuration
   sbom:
     containerImage: