multi_node.slurm 1.1 KB

123456789101112131415161718192021222324252627282930313233343536
  1. # Copyright (c) Meta Platforms, Inc. and affiliates.
  2. # This software may be used and distributed according to the terms of the GNU General Public License version 3.
  3. #!/bin/bash
  4. #SBATCH --job-name=Nano-2d-trainer-20b-8nodes
  5. #SBATCH --ntasks=2
  6. #SBATCH --nodes=2
  7. #SBATCH --gpus-per-task=4
  8. #SBATCH --partition=train
  9. nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
  10. nodes_array=($nodes)
  11. head_node=${nodes_array[0]}
  12. head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
  13. # Enable for A100
  14. export FI_PROVIDER="efa"
  15. echo Node IP: $head_node_ip
  16. export LOGLEVEL=INFO
  17. # debugging flags (optional)
  18. export NCCL_DEBUG=WARN
  19. export NCCL_DEBUG_SUBSYS=WARN
  20. export PYTHONFAULTHANDLER=1
  21. export LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
  22. export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
  23. export CUDA_LAUNCH_BLOCKING=0
  24. # on your cluster you might need these:
  25. # set the network interface
  26. export NCCL_SOCKET_IFNAME="ens"
  27. export FI_EFA_USE_DEVICE_RDMA=1
  28. srun torchrun --nproc_per_node 4 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $head_node_ip:29500 ./finetuning.py --enable_fsdp --use_peft --peft_method lora