1 year ago · 83fde7b94b
--- a/utils/train_utils.py
+++ b/utils/train_utils.py
@@ -84,7 +84,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
																                     if train_config.enable_fsdp:
															
 
																                         batch[key] = batch[key].to(local_rank)
															
 
																                     else:
															
 
																-                        batch[key] = batch[key].to('cuda')       
															
 
																+
															
 
																+                        batch[key] = batch[key].to('cuda:0')              
															
 
																                 loss = model(**batch).loss
															
 
																                 loss = loss / gradient_accumulation_steps
															
 
																                 total_loss += loss.detach().float()
															
@@ -198,7 +199,7 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer):
 
																                 if train_config.enable_fsdp:
															
 
																                     batch[key] = batch[key].to(local_rank)
															
 
																                 else:
															
 
																-                    batch[key] = batch[key].to('cuda')
															
 
																+                    batch[key] = batch[key].to('cuda:0')
															
 
																             # Ensure no gradients are computed for this scope to save memory
															
 
																             with torch.no_grad():
															
 
																                 # Forward pass and compute loss