multiproc.py
import os
import sys
import subprocess
import socket
import time
from argparse import ArgumentParser, REMAINDER

import paddle


def parse_args():
    """
    Helper function parsing the command line options.

    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description='PyTorch distributed training launch helper utility '
                    'that will spawn up multiple distributed processes')
    parser.add_argument('--nnodes', type=int, default=1,
                        help='The number of nodes to use for distributed training')
    parser.add_argument('--node_rank', type=int, default=0,
                        help='The rank of the node for multi-node distributed training')
    parser.add_argument('--nproc_per_node', type=int, default=1,
                        help='The number of processes to launch on each node. For GPU '
                             'training, this is recommended to be set to the number of '
                             'GPUs in your system so that each process can be bound to '
                             'a single GPU.')
    parser.add_argument('--master_addr', default='127.0.0.1', type=str,
                        help="Master node (rank 0)'s address. It should be either the "
                             'IP address or the hostname of node 0. For single-node '
                             'multi-process training, --master_addr can simply be '
                             '127.0.0.1.')
    parser.add_argument('--master_port', default=29500, type=int,
                        help="Master node (rank 0)'s free port that needs to be used "
                             'for communication during distributed training')
    parser.add_argument('training_script', type=str,
                        help='The full path to the single-GPU training program/script '
                             'to be launched in parallel, followed by all the arguments '
                             'for the training script')
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
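

# Example invocation: a sketch only; "train.py" and its flag are placeholders,
# not files or options in this repository.
#
#   python multiproc.py --nproc_per_node=4 train.py --some_training_flag
#
# This spawns four workers on the local node and passes --local_rank=0..3 to
# train.py along with the remaining arguments.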


def main():
    args = parse_args()

    # Total number of worker processes across all nodes.
    dist_world_size = args.nproc_per_node * args.nnodes

    # Pass the rendezvous information to every child process via the environment.
    current_env = os.environ.copy()
    current_env['MASTER_ADDR'] = args.master_addr
    current_env['MASTER_PORT'] = str(args.master_port)
    current_env['WORLD_SIZE'] = str(dist_world_size)

    processes = []
    for local_rank in range(args.nproc_per_node):
        # Global rank of this worker across all nodes.
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env['RANK'] = str(dist_rank)

        cmd = [sys.executable, '-u', args.training_script,
               '--local_rank={}'.format(local_rank)] + args.training_script_args
        print(cmd)

        # Rank 0 inherits the launcher's stdout; every other rank is redirected
        # to its own GPU_<local_rank>.log file.
        stdout = None if local_rank == 0 else open(
            'GPU_' + str(local_rank) + '.log', 'w')
        process = subprocess.Popen(cmd, env=current_env, stdout=stdout)
        processes.append(process)
    try:
        # Poll the children until they have all exited or one of them fails.
        up = True
        error = False
        while up and not error:
            up = False
            for p in processes:
                ret = p.poll()
                if ret is None:
                    up = True
                elif ret != 0:
                    error = True
            time.sleep(1)

        if error:
            # One worker failed: terminate the rest and report failure.
            for p in processes:
                if p.poll() is None:
                    p.terminate()
            sys.exit(1)
    except BaseException:
        # Covers KeyboardInterrupt, SystemExit and any other exception:
        # make sure no child process is left running, then re-raise.
        for p in processes:
            p.terminate()
        raise


if __name__ == '__main__':
    main()
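

# For reference, a minimal sketch (not part of this file) of how a launched
# training script might consume what this launcher provides; the flag and
# variable names below are illustrative assumptions:
#
#   import os
#   from argparse import ArgumentParser
#
#   parser = ArgumentParser()
#   parser.add_argument('--local_rank', type=int, default=0)
#   args = parser.parse_args()
#
#   rank = int(os.environ['RANK'])              # global rank set by the launcher
#   world_size = int(os.environ['WORLD_SIZE'])  # nnodes * nproc_per_node
#   master_addr = os.environ['MASTER_ADDR']
#   master_port = os.environ['MASTER_PORT']
#   # ... bind this process to GPU args.local_rank and initialize the
#   # distributed backend with rank/world_size/master_addr/master_port ...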