- request an interactive session with 4 GPUs on Bridges-2
salloc -N 1 -p GPU-shared --gres=gpu:4 -q interactive
- request a specific node (zanino or como) on rise for 60 minutes
srun --nodelist=zanino -t 60:00 --pty bash
srun --nodelist=como -t 60:00 --pty bash
- check the NVLink connection topology between GPUs
nvidia-smi topo -m
# create config file
touch ~/.nccl.conf
# user-level NCCL runtime configuration
# NCCL reads this file at startup, before the run
vim ~/.nccl.conf
- content of the config file (~/.nccl.conf):
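# dump the detected topology to an XML file
# NOTE: this file is presumably read literally (no shell expansion), so the
# backtick pwd in the paths below may need to be replaced with an absolute path - TODO confirm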
NCCL_TOPO_DUMP_FILE=`pwd`/topo.xml
# use the Simple protocol instead of LL (low latency)
NCCL_PROTO=Simple
# show debug info
NCCL_DEBUG=Info
# debug subsystem
NCCL_DEBUG_SUBSYS=ALL
# choose algo
NCCL_ALGO=Ring
# write debug output to a file (%h = hostname, %p = process ID)
NCCL_DEBUG_FILE=`pwd`/debugfile.%h.%p
# save graph file
NCCL_GRAPH_DUMP_FILE=`pwd`/graph.xml
# profile the run and save the result to output.nvvp
/usr/local/cuda-11.5/bin/nvprof -f -o output.nvvp ./blinkplus_examples/run_blinkplus
# visualize the profile result
/usr/local/cuda-11.5/bin/nvvp output.nvvp