@@ -85,8 +85,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     batch[key] = batch[key].to(local_rank)
                 else:
                     batch[key] = batch[key].to('cuda')
-            outputs = model(**batch)
-            loss = outputs.loss
+            loss = model(**batch).loss
             loss = loss / gradient_accumulation_steps
             total_loss += loss.detach().float()
             first_key = next(iter(batch))
@@ -105,7 +104,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     optimizer.step()
                     optimizer.zero_grad()

-            print(f"\n step {step} is completed and loss is {loss.detach().float()}")
+            print(f"\n step {step} is completed and loss is {loss.detach().float()}")
         # Reducing total_loss across all devices if there's more than one CUDA device
         if torch.cuda.device_count() > 1 and train_config.enable_fsdp:
             dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
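For readers who want to see the hunks in context, here is a minimal, self-contained sketch of the training-step pattern this diff touches: the loss read directly from the model output (the one-line change above), scaled for gradient accumulation, and the accumulated loss all-reduced when FSDP is enabled. Names taken from the diff (`train_config`, `local_rank`, `gradient_accumulation_steps`) are kept; the surrounding scaffolding is illustrative only, not the repository's actual implementation.

```python
import torch
import torch.distributed as dist

def train_one_epoch(model, train_dataloader, optimizer, train_config,
                    local_rank, gradient_accumulation_steps):
    """Illustrative sketch of the loop the diff above modifies."""
    total_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        # Move every tensor in the batch to the right device.
        for key in batch.keys():
            if train_config.enable_fsdp:
                batch[key] = batch[key].to(local_rank)
            else:
                batch[key] = batch[key].to('cuda')
        # HuggingFace-style models return an output object with a .loss
        # attribute, so it can be read in a single expression.
        loss = model(**batch).loss
        loss = loss / gradient_accumulation_steps
        total_loss += loss.detach().float()
        loss.backward()
        # Step the optimizer only every gradient_accumulation_steps batches
        # (or on the final batch of the epoch).
        if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            optimizer.zero_grad()
        print(f"\n step {step} is completed and loss is {loss.detach().float()}")
    # Reducing total_loss across all devices if there's more than one CUDA device
    if torch.cuda.device_count() > 1 and train_config.enable_fsdp:
        dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
    return total_loss / len(train_dataloader)
```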