-
Notifications
You must be signed in to change notification settings - Fork 1
/
scheduler_sc100m.sh
47 lines (46 loc) · 1.66 KB
/
scheduler_sc100m.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
# Starts one scheduler process and the local worker processes of a
# distributed training job. Each process learns its role from the MS_*
# environment variables exported right before it is launched.
#
# Positional arguments, as used below:
#   $1  dataset path, forwarded as the first `--data` option
#   $2  not referenced anywhere in this script
#   $3  last octet of the scheduler address (host becomes 61.47.2.$3)
#   $4  start index; selects the working directory device$(($4 / 8 + 1))
#   $5  optional text inserted into the scheduler's script name
#       (./1B_${5}train.py); the workers always run ./1B_train.py
echo "=========================================="
echo "Please run the script as: "
echo "bash $0 DATA_PATH UNUSED SCHED_IP_LAST_OCTET START_INDEX [SCRIPT_INFIX]"
echo "For example: bash $0 /path/dataset - 10 0"
echo "It is better to use the absolute path."
echo "==========================================="
# export ASCEND_GLOBAL_LOG_LEVEL=2
# export SLOG_PRINT_TO_STDOUT=2
export MS_ENABLE_FORMAT_MODE=1
export MS_HCCL_CM_INIT=1
export HCCL_DETERMINISTIC=1
# export MINDSPORE_DUMP_CONFIG='/share-nfs/w50035851/code/msver/dump.json'
data='cancer'
start=$4
# An empty $4 evaluates as 0 inside the arithmetic, giving "device1".
dir=device$((start / 8 + 1))
rm -rf log/fin*.txt
# `:?` aborts the script if the directory name is empty (e.g. the arithmetic
# above failed on a malformed $4), so the recursive deletes below can never
# run against an unintended location.
rm -rf -- "${dir:?cannot derive the working directory from START_INDEX}"
mkdir -- "$dir"
cp ./*.py "./$dir"
# Stop here if the directory cannot be entered: the cleanup that follows
# deletes by glob relative to the current directory.
cd -- "$dir" || {
  echo "cannot enter working directory: $dir" >&2
  exit 1
}
# The directory was just recreated, so these normally match nothing; `-f`
# keeps the no-match case silent.
rm -rf -- ./rank*
rm -f -- ./*.log
date
echo "start training"
ttl=32    # total number of workers in the whole cluster
num=8     # number of workers started by this script
ip=$3
batch=4
port=8448
# Arguments shared by every process. `--data` is intentionally kept twice,
# exactly as before; which occurrence wins is decided by the Python script's
# argument parser, which is not visible from here.
train_args=(--dist --data "$1" --batch "$batch" --data "$data")

export MS_WORKER_NUM=$ttl          # number of worker processes in the cluster
export MS_SCHED_HOST=61.47.2.$ip   # scheduler IP address
# export MS_SCHED_HOST=127.0.0.1   # scheduler on the local loopback address
export MS_SCHED_PORT=$port         # scheduler port
export MS_ROLE=MS_SCHED            # the next process started is the scheduler
# NOTE(review): the scheduler runs 1B_${5}train.py while the workers run
# 1B_train.py; the two only coincide when $5 is empty. Confirm this is intended.
python "./1B_${5}train.py" "${train_args[@]}" > scheduler.log 2>&1 &

# Start workers 1 .. num-1 in the background, one log file per worker.
for ((i = 1; i < num; i++)); do
  export MS_ROLE=MS_WORKER         # processes started from here on are workers
  export MS_NODE_ID=$i             # process id (optional)
  python ./1B_train.py "${train_args[@]}" > "worker_$i.log" 2>&1 &
done

# Worker 0 runs in the foreground so that its output reaches the terminal.
export MS_ROLE=MS_WORKER           # worker role
export MS_NODE_ID=0                # process id (optional)
python ./1B_train.py "${train_args[@]}"