running.txt

# Load the toolchain modules needed for the builds below.
# A single `module load` call takes several modulefiles and loads them in the
# order given (gcc first, then cuda).
module load gcc/5.5.0 cuda/11.1

# Install torch for A100.
# The versions are pinned as a set; the +cu111 builds correspond to the
# cuda/11.1 module and are resolved from the PyTorch wheel index page.
pip3 install \
  torch==1.9.0+cu111 \
  torchvision==0.10.0+cu111 \
  torchaudio==0.9.0 \
  --find-links https://download.pytorch.org/whl/torch_stable.html

# install apex (built from source with its C++ and CUDA extensions;
# presumably what the --apex-amp flag of the training commands relies on)
# Clone only when no checkout exists yet, so this step can be re-run.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
# Build inside a subshell: the calling shell stays in the project directory,
# so the relative ./distributed_train.sh and main.py paths used by the
# training commands still resolve afterwards. The && keeps pip from
# installing "./" of the wrong directory if the cd fails.
(
  cd apex &&
    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
)

# install timm
# NOTE(review): no version is pinned here, unlike the torch install; confirm
# which timm release these runs were made with and pin it.
pip install timm

# optional: fvcore, only needed for the FLOPs calculation
pip install fvcore

# optional: install mmcv, mmdetection, and mmsegmentation for the detection and segmentation experiments

# processing ImageNet: https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4


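# prepare data: download the ILSVRC2012 training and validation archives from the
# two URLs below, process them (see the gist above), and place the result under
# /dev/shm/imagenet/, the data path used by the training commands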
https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar
https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar


# ======= to run =======


# Training launches for the model variants. Every command below has the same
# shape and differs only in the --model name and the matching
# train_<model>.out log file:
#   - CUDA_VISIBLE_DEVICES exposes GPUs 0-7 to the job
#   - ./distributed_train.sh receives 8 (presumably the number of
#     processes/GPUs; confirm against the script) and the dataset directory
#     /dev/shm/imagenet/
#   - the remaining flags (-b 128 --lr 1e-3 --drop-path 0.1 --apex-amp) are
#     identical across all commands
#   - nohup ... & runs the job in the background; stdout goes to the .out file
# NOTE(review): each command claims all eight GPUs and is backgrounded, so
# these look intended to be started one at a time, not run top to bottom.

# model9_s12 variants
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model9_s12_7x7 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model9_s12_7x7.out &

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model9_s12_9x9 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model9_s12_9x9.out &

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model9_s12_3x3_7x7 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model9_s12_3x3_7x7.out &


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model9_s12_3x3 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model9_s12_3x3.out &


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model9_s12_5x5 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model9_s12_5x5.out &


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model9_s12_3x3dilated2 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model9_s12_3x3dilated2.out &


# static_shiftformer_s12 variants (model5, model6, model1 n8/n16, model4)
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model5_static_shiftformer_s12 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model5_static_shiftformer_s12.out &
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model6_static_shiftformer_s12 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model6_static_shiftformer_s12.out &
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model1_static_shiftformer_s12_n8 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model1_static_shiftformer_s12_n8.out &
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model1_static_shiftformer_s12_n16 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model1_static_shiftformer_s12_n16.out &


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model model4_static_shiftformer_s12 -b 128 --lr 1e-3 --drop-path 0.1 --apex-amp > train_model4_static_shiftformer_s12.out &

# PoolFormer baseline runs.
# MODEL and DROP_PATH are set immediately before each launch, and the log file
# name is derived from $MODEL, so the model that is trained always matches the
# file its output is written to. Previously the second launch reused
# MODEL=poolformer_s36 / DROP_PATH=0.2 while logging to train_poolformer_s12.out.
# Expansions are quoted so that an empty or unset value cannot silently shift
# the argument list.
MODEL=poolformer_s36
DROP_PATH=0.2
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model "$MODEL" -b 128 --lr 1e-3 --drop-path "$DROP_PATH" --apex-amp > "train_${MODEL}.out" &


# s12 uses drop-path 0.1, the same value as every other s12 run in these notes.
MODEL=poolformer_s12
DROP_PATH=0.1
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 nohup ./distributed_train.sh 8 /dev/shm/imagenet/ --model "$MODEL" -b 128 --lr 1e-3 --drop-path "$DROP_PATH" --apex-amp > "train_${MODEL}.out" &

# SPACH (presumably run from a checkout of the SPACH code base, which provides main.py; confirm)
# Train shiftvit_r4_base through torch.distributed.launch with 8 processes on
# this node. The job runs in the background under nohup; stdout is written to
# train_shiftvit_r4_base.out and --output_dir is set to
# checkpoint/shiftvit_r4_base.
nohup python -m torch.distributed.launch \
  --nproc_per_node 8 \
  --use_env \
  main.py \
  --model shiftvit_r4_base \
  --data-path /dev/shm/imagenet/ \
  --output_dir checkpoint/shiftvit_r4_base \
  --dist-eval \
  > train_shiftvit_r4_base.out &