Skip to content

nineep/pod-gpu-exporter

Repository files navigation

pod gpu metrics exporter

采集 k8s 节点上 pod 与其所使用的 GPU 之间的对应关系,并以 Prometheus metrics 的形式暴露

支持的metrics

# HELP pod_and_gpu_map_info Provide a mapping relationship between pod and gpu
# TYPE pod_and_gpu_map_info gauge
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test4"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default3",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2040ti",namespace="default",node="ti2",pod_name="test2"} 1.0
pod_and_gpu_map_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb91",gpu_model="2080ti",namespace="default",node="ti",pod_name="test3"} 1.0
# HELP ti_info Provide every node's pod and gpu mapping relationship
# TYPE ti_info gauge
ti_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test"} 1.0
ti_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test2"} 1.0
# HELP ti2_info Provide every node's pod and gpu mapping relationship
# TYPE ti2_info gauge
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test4"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default3",node="ti2",pod_name="test2"} 1.0
ti2_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 1.0
# HELP ti7_info Provide every node's pod and gpu mapping relationship
# TYPE ti7_info gauge
ti7_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test2"} 1.0
ti7_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2040ti",namespace="default",node="ti7",pod_name="test2"} 1.0
ti7_info{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb91",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test3"} 1.0

已废弃的 metrics(不再推荐使用)

# HELP POD_GPU_MAP_INFO_0 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_0 gauge
POD_GPU_MAP_INFO_0{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti",pod_name="test"} 0.0
# HELP POD_GPU_MAP_INFO_1 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_1 gauge
POD_GPU_MAP_INFO_1{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_2 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_2 gauge
POD_GPU_MAP_INFO_2{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_3 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_3 gauge
POD_GPU_MAP_INFO_3{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="4080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_4 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_4 gauge
POD_GPU_MAP_INFO_4{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_5 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_5 gauge
POD_GPU_MAP_INFO_5{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test4"} 0.0
# HELP POD_GPU_MAP_INFO_6 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_6 gauge
POD_GPU_MAP_INFO_6{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_7 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_7 gauge
POD_GPU_MAP_INFO_7{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default3",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_8 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_8 gauge
POD_GPU_MAP_INFO_8{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_9 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_9 gauge
POD_GPU_MAP_INFO_9{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2080ti",namespace="default",node="ti7",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_10 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_10 gauge
POD_GPU_MAP_INFO_10{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb61",gpu_model="2040ti",namespace="default",node="ti2",pod_name="test2"} 0.0
# HELP POD_GPU_MAP_INFO_11 pod gpu map info - please use labels value to match pod and gpu
# TYPE POD_GPU_MAP_INFO_11 gauge
POD_GPU_MAP_INFO_11{gpu_id="GPU-40f717b2-01dc-1a31-6b17-d487ffe7cb91",gpu_model="2080ti",namespace="default",node="ti",pod_name="test3"} 0.0

https://github.com/tkestack/gpu-manager 提供的 metric(访问地址为 IP:5678/metric,默认未开启):

| metric name | help | 说明 | k8s yaml def |
| --- | --- | --- | --- |
| container_gpu_memory_total | gpu memory usage in MiB | pod 的 gpu 显存使用量(MiB) | |
| container_request_gpu_memory | request of gpu memory in MiB | pod 申请的 gpu 显存总量(MiB) | vcuda-memory: 30<br>申请显存 30 × 256MiB = 7680MiB |
| container_gpu_utilization | gpu utilization | pod 的 gpu 算力使用率:每块卡的 metric 值为 0-100,包含每块卡的使用率以及所有卡的使用率之和 | |
| container_request_gpu_utilization | request of gpu utilization | pod 申请的 gpu 算力(多少 vgpu 卡):申请 0.5 块卡,metric 值为 0.5;申请 2 块卡,metric 值为 2 | vcuda-core: 200<br>申请卡数 200 / 100 = 2 |

注意:单个 pod 申请的 vcuda-core 取值须为 0-100,或者为 100 的整数倍

About

k8s pod gpu metric

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published