-
Notifications
You must be signed in to change notification settings - Fork 0
/
running.txt
77 lines (36 loc) · 3.6 KB
/
running.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Environment setup: load the compiler/CUDA modules, then install the Python
# packages used by the training commands further down in this file.

# set up the required modules
module load gcc/5.5.0
module load cuda/11.1

# install torch for A100
pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

# install apex
# Skip the clone when a checkout is already present so this step can be re-run.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
# Build inside a subshell: the caller's working directory is left untouched, so
# the relative ./distributed_train.sh used by the training commands still
# resolves afterwards. If the directory cannot be entered, the build is skipped
# instead of running pip against whatever directory we happen to be in.
# pip3 (not pip) is used from here on so every package goes through the same
# installer as the torch wheels above.
(
  cd apex || exit 1
  pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
)

# install timm
pip3 install timm

# optional, for FLOPs calculation
pip3 install fvcore
# optional mmcv, mmdetection, mmsegmentation for detection and segmentation
# processing ImageNet: https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
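# prepare data: download the two ILSVRC2012 archives listed here, then unpack them
# (the training commands in this file read the dataset from /dev/shm/imagenet/)
#   https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar
#   https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar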
# ------- to run -------
#######################################
# Start one background training run for a single model variant.
# GPUs 0-7 are made visible to the job, nohup detaches it from hangups, and
# stdout is written to train_<model>.out in the current directory.
# Arguments: $1 - value passed to --model (also used to name the log file)
#######################################
train_s12_variant() {
  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ \
    --model "$1" -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp \
    > "train_$1.out" &
}

# NOTE(review): every call below returns immediately (the job is backgrounded)
# and all of them target the same eight GPUs. Presumably only one is meant to
# be started at a time -- confirm before running the whole list in one go.

# model9 kernel-size variants
train_s12_variant model9_s12_7x7
train_s12_variant model9_s12_9x9
train_s12_variant model9_s12_3x3_7x7
train_s12_variant model9_s12_3x3
train_s12_variant model9_s12_5x5
train_s12_variant model9_s12_3x3dilated2

# static shiftformer variants
train_s12_variant model5_static_shiftformer_s12
train_s12_variant model6_static_shiftformer_s12
train_s12_variant model1_static_shiftformer_s12_n8
train_s12_variant model1_static_shiftformer_s12_n16
train_s12_variant model4_static_shiftformer_s12
# PoolFormer baselines.
# MODEL and DROP_PATH are set immediately before each launch and the log file
# name is derived from MODEL, so a log can never be labelled with a model other
# than the one that actually produced it.

# poolformer_s36
MODEL=poolformer_s36
DROP_PATH=0.2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model "$MODEL" -b 128 --lr 1e-3 --drop-path "$DROP_PATH" --apex-amp > "train_${MODEL}.out" &

# poolformer_s12
MODEL=poolformer_s12
# 0.1 is the drop-path rate used by every other s12 run in this file;
# NOTE(review): confirm against the PoolFormer training recipe.
DROP_PATH=0.1
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model "$MODEL" -b 128 --lr 1e-3 --drop-path "$DROP_PATH" --apex-amp > "train_${MODEL}.out" &
# SPACH
# Background run of main.py through torch.distributed.launch
# (--nproc_per_node 8) for the shiftvit_r4_base model. The dataset is read from
# /dev/shm/imagenet/, --output_dir points at checkpoint/shiftvit_r4_base, and
# stdout is written to train_shiftvit_r4_base.out.
nohup python -m torch.distributed.launch \
  --nproc_per_node 8 \
  --use_env \
  main.py \
  --model shiftvit_r4_base \
  --data-path /dev/shm/imagenet/ \
  --output_dir checkpoint/shiftvit_r4_base \
  --dist-eval \
  > train_shiftvit_r4_base.out &