采集k8s节点上pod与其所使用的GPU之间的对应关系
访问endpoint: http://127.0.0.1:9401/metrics
# HELP pod_and_gpu_map_info Provide a mapping relationship between pod and gpu
# TYPE pod_and_gpu_map_info gauge
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test4"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default3",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2040ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb91",gpu_model="2080ti",namespace="default",node="ti",pod_name="test3"} 1.0
访问endpoint: http://127.0.0.1:9401/metrics/nodes
# HELP ti_info Provide every node's pod and gpu mapping relationship
# TYPE ti_info gauge
ti_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test"} 1.0
ti_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test2"} 1.0
# HELP ti2_info Provide every node's pod and gpu mapping relationship
# TYPE ti2_info gauge
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test4"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default3",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
# HELP ti7_info Provide every node's pod and gpu mapping relationship
# TYPE ti7_info gauge
ti7_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test2"} 1.0
ti7_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2040ti",namespace="default",node="ti7",pod_name="test2"} 1.0
ti7_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb91",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test3"} 1.0
访问endpoint: http://127.0.0.1:9401/metrics
# HELP POD_GPU_MAP_INFO_0 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_0 gauge
POD_GPU_MAP_INFO_0{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test"} 0.0
# HELP POD_GPU_MAP_INFO_1 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_1 gauge
POD_GPU_MAP_INFO_1{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_2 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_2 gauge
POD_GPU_MAP_INFO_2{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_3 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_3 gauge
POD_GPU_MAP_INFO_3{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_4 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_4 gauge
POD_GPU_MAP_INFO_4{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_5 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_5 gauge
POD_GPU_MAP_INFO_5{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test4"} 0.0
# HELP POD_GPU_MAP_INFO_6 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_6 gauge
POD_GPU_MAP_INFO_6{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_7 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_7 gauge
POD_GPU_MAP_INFO_7{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default3",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_8 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_8 gauge
POD_GPU_MAP_INFO_8{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_9 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_9 gauge
POD_GPU_MAP_INFO_9{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_10 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_10 gauge
POD_GPU_MAP_INFO_10{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2040ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_11 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_11 gauge
POD_GPU_MAP_INFO_11{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb91",gpu_model="2080ti",namespace="default",node="ti",pod_name="test3"} 0.0
https://github.com/tkestack/gpu-manager 提供以下metric(访问 IP:5678/metric,默认未开启):
metric name | help | 说明 | k8s yaml def |
---|---|---|---|
container_gpu_memory_total | gpu memory usage in MiB | pod的gpu显存使用量 MiB | |
container_request_gpu_memory | request of gpu memory in MiB | pod申请的gpu显存总量 MiB | |
container_gpu_utilization | gpu utilization | pod的gpu算力使用率: 每块卡的metric值为0-100,包含每块卡的使用率以及所有卡的使用率之和 | |
container_request_gpu_utilization | request of gpu utilization | pod申请的gpu算力(多少vgpu卡): 申请0.5块卡,metric值为0.5;申请2块卡,metric值为2 | 注意:一个pod的申请值为0-100,或者100的倍数 |