forked from conweave-project/conweave-ns3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
431 lines (376 loc) · 18.6 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#!/usr/bin/python3
from genericpath import exists
import subprocess
import os
import time
from xmlrpc.client import boolean
import numpy as np
import copy
import shutil
import random
from datetime import datetime
import sys
import os
import argparse
from datetime import date
# randomID
random.seed(datetime.now())
MAX_RAND_RANGE = 1000000000
# config template
config_template = """TOPOLOGY_FILE config/{topo}.txt
FLOW_FILE config/{flow}.txt
FLOW_INPUT_FILE mix/output/{id}/{id}_in.txt
CNP_OUTPUT_FILE mix/output/{id}/{id}_out_cnp.txt
FCT_OUTPUT_FILE mix/output/{id}/{id}_out_fct.txt
PFC_OUTPUT_FILE mix/output/{id}/{id}_out_pfc.txt
QLEN_MON_FILE mix/output/{id}/{id}_out_qlen.txt
VOQ_MON_FILE mix/output/{id}/{id}_out_voq.txt
VOQ_MON_DETAIL_FILE mix/output/{id}/{id}_out_voq_per_dst.txt
UPLINK_MON_FILE mix/output/{id}/{id}_out_uplink.txt
CONN_MON_FILE mix/output/{id}/{id}_out_conn.txt
EST_ERROR_MON_FILE mix/output/{id}/{id}_out_est_error.txt
QLEN_MON_START {qlen_mon_start}
QLEN_MON_END {qlen_mon_end}
SW_MONITORING_INTERVAL {sw_monitoring_interval}
FLOWGEN_START_TIME {flowgen_start_time}
FLOWGEN_STOP_TIME {flowgen_stop_time}
BUFFER_SIZE {buffer_size}
CC_MODE {cc_mode}
LB_MODE {lb_mode}
ENABLE_PFC {enabled_pfc}
ENABLE_IRN {enabled_irn}
CONWEAVE_TX_EXPIRY_TIME {cwh_tx_expiry_time}
CONWEAVE_REPLY_TIMEOUT_EXTRA {cwh_extra_reply_deadline}
CONWEAVE_PATH_PAUSE_TIME {cwh_path_pause_time}
CONWEAVE_EXTRA_VOQ_FLUSH_TIME {cwh_extra_voq_flush_time}
CONWEAVE_DEFAULT_VOQ_WAITING_TIME {cwh_default_voq_waiting_time}
ALPHA_RESUME_INTERVAL 1
RATE_DECREASE_INTERVAL 4
CLAMP_TARGET_RATE 0
RP_TIMER 300
FAST_RECOVERY_TIMES 1
EWMA_GAIN {ewma_gain}
RATE_AI {ai}Mb/s
RATE_HAI {hai}Mb/s
MIN_RATE 100Mb/s
DCTCP_RATE_AI {dctcp_ai}Mb/s
ERROR_RATE_PER_LINK 0.0000
L2_CHUNK_SIZE 4000
L2_ACK_INTERVAL 1
L2_BACK_TO_ZERO 0
RATE_BOUND 1
HAS_WIN {has_win}
VAR_WIN {var_win}
FAST_REACT {fast_react}
MI_THRESH {mi}
INT_MULTI {int_multi}
GLOBAL_T 1
U_TARGET 0.95
MULTI_RATE 0
SAMPLE_FEEDBACK 0
ENABLE_QCN 1
USE_DYNAMIC_PFC_THRESHOLD 1
PACKET_PAYLOAD_SIZE 1000
LINK_DOWN 0 0 0
KMAX_MAP {kmax_map}
KMIN_MAP {kmin_map}
PMAX_MAP {pmax_map}
LOAD {load}
RANDOM_SEED 1
"""
# LB/CC mode matching
cc_modes = {
"dcqcn": 1,
"hpcc": 3,
"timely": 7,
"dctcp": 8,
}
lb_modes = {
"fecmp": 0,
"drill": 2,
"conga": 3,
"letflow": 6,
"conweave": 9,
}
topo2bdp = {
"leaf_spine_128_100G_OS2": 104000, # 2-tier -> all 100Gbps
"fat_k8_100G_OS2": 156000, # 3-tier -> all 100Gbps
}
FLOWGEN_DEFAULT_TIME = 2.0 # see /traffic_gen/traffic_gen.py::base_t
def main():
# make directory if not exists
isExist = os.path.exists(os.getcwd() + "/mix/output/")
if not isExist:
os.makedirs(os.getcwd() + "/mix/output/")
print("The new directory is created - {}".format(os.getcwd() + "/mix/output/"))
parser = argparse.ArgumentParser(description='run simulation')
parser.add_argument('--cc', dest='cc', action='store',
default='dcqcn', help="hpcc/dcqcn/timely/dctcp (default: dcqcn)")
parser.add_argument('--lb', dest='lb', action='store',
default='fecmp', help="fecmp/pecmp/drill/conga (default: fecmp)")
parser.add_argument('--pfc', dest='pfc', action='store',
type=int, default=1, help="enable PFC (default: 1)")
parser.add_argument('--irn', dest='irn', action='store',
type=int, default=0, help="enable IRN (default: 0)")
parser.add_argument('--simul_time', dest='simul_time', action='store',
default='0.1', help="traffic time to simulate (up to 3 seconds) (default: 0.1)")
parser.add_argument('--buffer', dest="buffer", action='store',
default='9', help="the switch buffer size (MB) (default: 9)")
parser.add_argument('--netload', dest='netload', action='store', type=int,
default=40, help="Network load at NIC to generate traffic (default: 40.0)")
parser.add_argument('--bw', dest="bw", action='store',
default='100', help="the NIC bandwidth (Gbps) (default: 100)")
parser.add_argument('--topo', dest='topo', action='store',
default='leaf_spine_128_100G', help="the name of the topology file (default: leaf_spine_128_100G_OS2)")
parser.add_argument('--cdf', dest='cdf', action='store',
default='AliStorage2019', help="the name of the cdf file (default: AliStorage2019)")
parser.add_argument('--enforce_win', dest='enforce_win', action='store',
type=int, default=0, help="enforce to use window scheme (default: 0)")
parser.add_argument('--sw_monitoring_interval', dest='sw_monitoring_interval', action='store',
type=int, default=10000, help="interval of sampling statistics for queue status (default: 10000ns)")
# #### CONWEAVE PARAMETERS ####
# parser.add_argument('--cwh_extra_reply_deadline', dest='cwh_extra_reply_deadline', action='store',
# type=int, default=4, help="extra-timeout, where reply_deadline = base-RTT + extra-timeout (default: 4us)")
# parser.add_argument('--cwh_path_pause_time', dest='cwh_path_pause_time', action='store',
# type=int, default=16, help="Time to pause the path with ECN feedback (default: 8us")
# parser.add_argument('--cwh_extra_voq_flush_time', dest='cwh_extra_voq_flush_time', action='store',
# type=int, default=16, help="Extra VOQ Flush Time (default: 8us for IRN)")
# parser.add_argument('--cwh_default_voq_waiting_time', dest='cwh_default_voq_waiting_time', action='store',
# type=int, default=400, help="Default VOQ Waiting Time (default: 400us)")
# parser.add_argument('--cwh_tx_expiry_time', dest='cwh_tx_expiry_time', action='store',
# type=int, default=1000, help="timeout value of ConWeave Tx for CLEAR signal (default: 1000us)")
args = parser.parse_args()
# make running ID of this config
# need to check directory exists or not
isExist = True
config_ID = 0
while (isExist):
config_ID = str(random.randrange(MAX_RAND_RANGE))
isExist = os.path.exists(os.getcwd() + "/mix/output/" + config_ID)
# input parameters
cc_mode = cc_modes[args.cc]
lb_mode = lb_modes[args.lb]
enabled_pfc = int(args.pfc)
enabled_irn = int(args.irn)
bw = int(args.bw)
buffer = args.buffer
topo = args.topo
enforce_win = args.enforce_win
cdf = args.cdf
flowgen_start_time = FLOWGEN_DEFAULT_TIME # default: 2.0
flowgen_stop_time = flowgen_start_time + \
float(args.simul_time) # default: 2.0
sw_monitoring_interval = int(args.sw_monitoring_interval)
# get over-subscription ratio from topoogy name
netload = args.netload
oversub = int(topo.replace("\n", "").split("OS")[-1].replace(".txt", ""))
assert (int(args.netload) % oversub == 0)
hostload = int(args.netload) / oversub
assert (hostload > 0)
# Sanity checks
if (args.cc == "timely" or args.cc == "hpcc") and args.lb == "conweave":
raise Exception(
"CONFIG ERROR : ConWeave currently does not support RTT-based protocols. Plz modify its logic accordingly.")
if enabled_irn == 1 and enabled_pfc == 1:
raise Exception(
"CONFIG ERROR : If IRN is turn-on, then you should turn off PFC (for better perforamnce).")
if enabled_irn == 0 and enabled_pfc == 0:
raise Exception(
"CONFIG ERROR : Either IRN or PFC should be true (at least one).")
if float(args.simul_time) < 0.005:
raise Exception("CONFIG ERROR : Runtime must be larger than 5ms (= warmup interval).")
# sniff number of servers
with open("config/{topo}.txt".format(topo=args.topo), 'r') as f_topo:
line = f_topo.readline().split(" ")
n_host = int(line[0]) - int(line[1])
assert (hostload >= 0 and hostload < 100)
flow = "L_{load:.2f}_CDF_{cdf}_N_{n_host}_T_{time}ms_B_{bw}_flow".format(
load=hostload, cdf=args.cdf, n_host=n_host, time=int(float(args.simul_time)*1000), bw=bw)
# check the file exists
if (exists(os.getcwd() + "/config/" + flow + ".txt")):
print("Input traffic file with load:{load:.2f}, cdf:{cdf}, n_host:{n_host} already exists".format(
load=hostload, cdf=cdf, n_host=n_host))
else: # make the input traffic file
print("Generate a input traffic file...")
print("python ./traffic_gen/traffic_gen.py -c {cdf} -n {n_host} -l {load} -b {bw} -t {time} -o {output}".format(
cdf=os.getcwd() + "/../traffic_gen/" + args.cdf + ".txt",
n_host=n_host,
load=hostload / 100.0,
bw=args.bw + "G",
time=args.simul_time,
output=os.getcwd() + "/config/" + flow + ".txt"))
os.system("python ./traffic_gen/traffic_gen.py -c {cdf} -n {n_host} -l {load} -b {bw} -t {time} -o {output}".format(
cdf=os.getcwd() + "/traffic_gen/" + args.cdf + ".txt",
n_host=n_host,
load=hostload / 100.0,
bw=args.bw + "G",
time=args.simul_time,
output=os.getcwd() + "/config/" + flow + ".txt"))
# sanity check - bandwidth
with open("config/{topo}.txt".format(topo=args.topo), 'r') as f_topo:
first_line = f_topo.readline().split(" ")
n_host = int(first_line[0]) - int(first_line[1])
n_link = int(first_line[2])
i = 0
for line in f_topo.readlines()[1:]:
i += 1
if (i > n_link):
break
parsed = line.split(" ")
if len(parsed) > 2 and (int(parsed[0]) < n_host or int(parsed[1]) < n_host):
assert (int(parsed[2].replace("Gbps", "")) == int(bw))
print("All NIC bandwidth is {bw}Gbps".format(bw=bw))
##################################################################
########## ConWeave parameters ##########
##################################################################
if (lb_mode == 9):
cwh_extra_reply_deadline = 4 # 4us, NOTE: this is "extra" term to base RTT
cwh_path_pause_time = 16 # 8us (K_min) or 16us
if "leaf_spine" in topo: # 2-tier
cwh_extra_voq_flush_time = 16
cwh_default_voq_waiting_time = 200
cwh_tx_expiry_time = 300 # 300us
elif "fat" in topo and enabled_pfc == 0 and enabled_irn == 1: # 3-tier, IRN
cwh_extra_voq_flush_time = 16
cwh_default_voq_waiting_time = 300
cwh_tx_expiry_time = 1000 # 1ms
elif "fat" in topo and enabled_pfc == 1 and enabled_irn == 0: # 3-tier, Lossless
cwh_extra_voq_flush_time = 64
cwh_default_voq_waiting_time = 600
cwh_tx_expiry_time = 1000 # 1ms
else:
raise Exception(
"Unsupported ConWeave Parameter Setup")
else:
#### CONWEAVE PARAMETERS (DUMMY) ####
cwh_extra_reply_deadline = 4
cwh_path_pause_time = 16
cwh_extra_voq_flush_time = 64
cwh_default_voq_waiting_time = 400
cwh_tx_expiry_time = 1000
##################################################################
# make directory if not exists
isExist = os.path.exists(os.getcwd() + "/mix/output/" + config_ID + "/")
assert (not isExist)
# if not isExist:
os.makedirs(os.getcwd() + "/mix/output/" + config_ID + "/")
print("The new directory is created - {}".format(os.getcwd() +
"/mix/output/" + config_ID + "/"))
config_name = os.getcwd() + "/mix/output/" + config_ID + "/config.txt"
print("Config filename:{}".format(config_name))
# By default, DCQCN uses no window (rate-based).
has_win = 0
var_win = 0
if (cc_mode == 3 or cc_mode == 8 or enforce_win == 1): # HPCC or DCTCP or enforcement
has_win = 1
var_win = 1
if enforce_win == 1:
print("### INFO: Enforced to use window scheme! ###")
# record to history
simulday = datetime.now().strftime("%m/%d/%y")
with open("./mix/.history", "a") as history:
history.write("{simulday},{config_ID},{cc_mode},{lb_mode},{cwh_tx_expiry_time},{cwh_extra_reply_deadline},{cwh_path_pause_time},{cwh_extra_voq_flush_time},{cwh_default_voq_waiting_time},{pfc},{irn},{has_win},{var_win},{topo},{bw},{cdf},{load},{time}\n".format(
simulday=simulday,
config_ID=config_ID,
cc_mode=cc_mode,
lb_mode=lb_mode,
cwh_tx_expiry_time=cwh_tx_expiry_time,
cwh_extra_reply_deadline=cwh_extra_reply_deadline,
cwh_path_pause_time=cwh_path_pause_time,
cwh_extra_voq_flush_time=cwh_extra_voq_flush_time,
cwh_default_voq_waiting_time=cwh_default_voq_waiting_time,
pfc=enabled_pfc,
irn=enabled_irn,
has_win=has_win,
var_win=var_win,
topo=topo,
bw=bw,
cdf=cdf,
load=netload,
time=args.simul_time,
))
# 1 BDP calculation
if topo2bdp.get(topo) == None:
print("ERROR - topology is not registered in run.py!!", flush=True)
return
bdp = int(topo2bdp[topo])
print("1BDP = {}".format(bdp))
# DCQCN parameters (NOTE: HPCC's 400KB/1600KB is too large, although used in Microsoft)
kmax_map = "6 %d %d %d %d %d %d %d %d %d %d %d %d" % (
bw*200000000, 400, bw*500000000, 400, bw*1000000000, 400, bw*2*1000000000, 400, bw*2500000000, 400, bw*4*1000000000, 400)
kmin_map = "6 %d %d %d %d %d %d %d %d %d %d %d %d" % (
bw*200000000, 100, bw*500000000, 100, bw*1000000000, 100, bw*2*1000000000, 100, bw*2500000000, 100, bw*4*1000000000, 100)
pmax_map = "6 %d %d %d %d %d %.2f %d %.2f %d %.2f %d %.2f" % (
bw*200000000, 0.2, bw*500000000, 0.2, bw*1000000000, 0.2, bw*2*1000000000, 0.2, bw*2500000000, 0.2, bw*4*1000000000, 0.2)
# queue monitoring
qlen_mon_start = flowgen_start_time
qlen_mon_end = flowgen_stop_time
if (cc_mode == 1): # DCQCN
ai = 10 * bw / 25
hai = 25 * bw / 25
dctcp_ai = 1000
fast_react = 0
mi = 0
int_multi = 1
ewma_gain = 0.00390625
config = config_template.format(id=config_ID, topo=topo, flow=flow,
qlen_mon_start=qlen_mon_start, qlen_mon_end=qlen_mon_end, flowgen_start_time=flowgen_start_time,
flowgen_stop_time=flowgen_stop_time, sw_monitoring_interval=sw_monitoring_interval,
load=netload, buffer_size=buffer, lb_mode=lb_mode, cwh_tx_expiry_time=cwh_tx_expiry_time,
cwh_extra_reply_deadline=cwh_extra_reply_deadline, cwh_default_voq_waiting_time=cwh_default_voq_waiting_time,
cwh_path_pause_time=cwh_path_pause_time, cwh_extra_voq_flush_time=cwh_extra_voq_flush_time,
enabled_pfc=enabled_pfc, enabled_irn=enabled_irn,
cc_mode=cc_mode,
ai=ai, hai=hai, dctcp_ai=dctcp_ai,
has_win=has_win, var_win=var_win,
fast_react=fast_react, mi=mi, int_multi=int_multi, ewma_gain=ewma_gain,
kmax_map=kmax_map, kmin_map=kmin_map, pmax_map=pmax_map)
else:
print("unknown cc:{}".format(args.cc))
with open(config_name, "w") as file:
file.write(config)
# run program
print("Running simulation...")
output_log = config_name.replace(".txt", ".log")
run_command = "./waf --run 'scratch/network-load-balance {config_name}' > {output_log} 2>&1".format(
config_name=config_name, output_log=output_log)
with open("./mix/.history", "a") as history:
history.write(run_command + "\n")
history.write(
"./waf --run 'scratch/network-load-balance' --command-template='gdb --args %s {config_name}'\n".format(
config_name=config_name)
)
history.write("\n")
print(run_command)
os.system("./waf --run 'scratch/network-load-balance {config_name}' > {output_log} 2>&1".format(
config_name=config_name, output_log=output_log))
####################################################
# Analyze the output FCT #
####################################################
# NOTE: collect data except warm-up and cold-finish period
fct_analysis_time_limit_begin = int(
flowgen_start_time * 1e9) + int(0.005 * 1e9) # warmup
fct_analysistime_limit_end = int(
flowgen_stop_time * 1e9) + int(0.05 * 1e9) # extra term
print("Analyzing output FCT...")
print("python3 fctAnalysis.py -id {config_ID} -dir {dir} -bdp {bdp} -sT {fct_analysis_time_limit_begin} -fT {fct_analysistime_limit_end} > /dev/null 2>&1".format(
config_ID=config_ID, dir=os.getcwd(), bdp=bdp, fct_analysis_time_limit_begin=fct_analysis_time_limit_begin, fct_analysistime_limit_end=fct_analysistime_limit_end))
os.system("python3 fctAnalysis.py -id {config_ID} -dir {dir} -bdp {bdp} -sT {fct_analysis_time_limit_begin} -fT {fct_analysistime_limit_end} > /dev/null 2>&1".format(
config_ID=config_ID, dir=os.getcwd(), bdp=bdp, fct_analysis_time_limit_begin=fct_analysis_time_limit_begin, fct_analysistime_limit_end=fct_analysistime_limit_end))
if lb_mode == 9: # ConWeave Logging
################################################################
# Analyze hardware resource of ConWeave #
################################################################
# NOTE: collect data except warm-up and cold-finish period
queue_analysis_time_limit_begin = int(
flowgen_start_time * 1e9) + int(0.005 * 1e9) # warmup
queue_analysistime_limit_end = int(flowgen_stop_time * 1e9)
print("Analyzing output Queue...")
print("python3 queueAnalysis.py -id {config_ID} -dir {dir} -sT {queue_analysis_time_limit_begin} -fT {queue_analysistime_limit_end} > /dev/null 2>&1".format(
config_ID=config_ID, dir=os.getcwd(), queue_analysis_time_limit_begin=queue_analysis_time_limit_begin, queue_analysistime_limit_end=queue_analysistime_limit_end))
os.system("python3 queueAnalysis.py -id {config_ID} -dir {dir} -sT {queue_analysis_time_limit_begin} -fT {queue_analysistime_limit_end} > /dev/null 2>&1".format(
config_ID=config_ID, dir=os.getcwd(), queue_analysis_time_limit_begin=queue_analysis_time_limit_begin, queue_analysistime_limit_end=queue_analysistime_limit_end,
monitoringInterval=sw_monitoring_interval)) # TODO: parameterize
print("\n\n============== Done ============== ")
if __name__ == "__main__":
main()