-
Notifications
You must be signed in to change notification settings - Fork 0
/
monitor_ib_traffic.py
106 lines (86 loc) · 3.78 KB
/
monitor_ib_traffic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Inspired by https://github.com/vpenso/ganglia-sensors/blob/master/lib/python_modules/infiniband.py#/
import logging
import re
import sys
import json
import time
import subprocess
METRIC_NAMES = ["PortXmitData","PortRcvData"]
metrics = {}
def decode_str_list(line_list):
return [x.decode("utf-8") for x in line_list]
def get_cmd_out(cmd):
return decode_str_list(subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.readlines())
def ibstat_ports():
lid7port = []
ibstat = get_cmd_out("ibstat")
for index,line in enumerate(ibstat):
line = line.strip()
match = re.match("Port [0-9]\:",line)
if match:
number = line.split(' ')[1].replace(':','')
state = ibstat[index+1].split(':')[1].strip()
an = re.match("Active",state)
if an:
lid = ibstat[index+4].split(':')[1].strip()
lid7port.append((lid, number))
return lid7port
# Return a key-value pair, eventually empty if the line didn't match
def parse_counter_line(line, keys):
if re.match("^[a-zA-z0-9]*\:\.\.\.*[0-9]*$",line):
line = line.split(':')
key = line[0]
if key in keys:
value = line[1].replace('.','').strip()
return (key, int(value))
return ("",0)
# Parse the complete input from perfquery for lines matching counters,
# and return all counters and their values as dictionary
def parse_counters(counters, keys):
counts = {}
for line in counters:
key, value = parse_counter_line(line, keys)
# Omit empty return values...
if key:
logging.debug("[parse_counters] Found counter: %s=%s", key, value)
counts[key] = value
return counts
# Call perfquery for extended traffic counters, and reset the counters
def traffic_counter(lid, port = 1):
command = ["/usr/sbin/perfquery", "-x", "-r", lid, port]
logging.debug("[traffic_counters] Execute command: %s", " ".join(command))
counters = get_cmd_out(command)
return parse_counters(counters, METRIC_NAMES)
def init_metric():
metrics["last_update"] = time.time()
def update_metric():
global metrics
# NOTE: time_since_last_update is not calculated precisely
time_since_last_update = time.time() - metrics["last_update"]
logging.debug("[update_metrics] Update metrics after %ss", time_since_last_update)
for lid, port in ibstat_ports():
metric2counts = traffic_counter(lid, port)
metrics[lid] = {port: metric2counts}
for metric in METRIC_NAMES:
# Data port counters indicate octets divided by 4 rather than just octets.
#
# It's consistent with what the IB spec says (IBA 1.2 vol 1 p.948) as to
# how these quantities are counted. They are defined to be octets divided
# by 4 so the choice is to display them the same as the actual quantity
# (which is why they are named Data rather than Octets) or to multiply by
# 4 for Octets. The former choice was made.
#
# For simplification the values are multiplied by 4 to represent octets/bytes
num_bytes = metric2counts[metric] * 4
metrics[lid][port][metric.replace("Data", "Bytes")] = num_bytes
metrics[lid][port][metric.replace("Data", "GB/s")] = num_bytes / (time_since_last_update * 1024*1024*1024)
metrics["last_update"] = time.time()
if __name__ == '__main__':
logging.root.setLevel(logging.INFO)
update_interval = 10 if len(sys.argv) == 1 else sys.argv[1] # default is 10s
init_metric()
while True:
update_metric()
print("Note: This is a **Rough** traffic monitor for Infiniband, the bw below may be bigger than real bw")
print(json.dumps(metrics, indent=2, sort_keys=True))
time.sleep(update_interval)