@@ -84,9 +84,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     if train_config.enable_fsdp:
                         batch[key] = batch[key].to(local_rank)
                     else:
-                        batch[key] = batch[key].to('cuda')
-                outputs = model(**batch)
-                loss = outputs.loss
+                        batch[key] = batch[key].to('cuda:0')
+                loss = model(**batch).loss
                 loss = loss / gradient_accumulation_steps
                 total_loss += loss.detach().float()
                 first_key = next(iter(batch))
@@ -105,7 +104,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                         optimizer.step()
                         optimizer.zero_grad()

-                print(f"\n step {step} is completed and loss is {loss.detach().float()}")
+                print(f"\n step {step} is completed and loss is {loss.detach().float()}")
         # Reducing total_loss across all devices if there's more than one CUDA device
         if torch.cuda.device_count() > 1 and train_config.enable_fsdp:
             dist.all_reduce(total_loss, op=dist.ReduceOp.SUM)
@@ -117,6 +116,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche

         print(f"Max CUDA memory allocated was {memtrace.peak} GB")
         print(f"Max CUDA memory reserved was {memtrace.max_reserved} GB")
+        print(f"Peak active CUDA memory was {memtrace.peak_active_gb} GB")
         print(f"Cuda Malloc retires : {memtrace.cuda_malloc_retires}")
         print(f"CPU Total Peak Memory consumed during the train (max): {memtrace.cpu_peaked + memtrace.cpu_begin} GB")

@@ -202,7 +202,7 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer):
             if train_config.enable_fsdp:
                 batch[key] = batch[key].to(local_rank)
             else:
-                batch[key] = batch[key].to('cuda')
+                batch[key] = batch[key].to('cuda:0')
         # Ensure no gradients are computed for this scope to save memory
         with torch.no_grad():
             # Forward pass and compute loss
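
For context on the newly printed peak_active_gb value: a minimal sketch (not part of this patch) of how a MemoryTrace-style tracker could derive such a number from torch.cuda.memory_stats(); the helper name and the rounding are assumptions for illustration only.

import torch

# Sketch only (assumed, not from this patch): peak active CUDA memory in GB.
# "active_bytes.all.peak" is the caching allocator's peak of active allocation
# bytes; it requires CUDA to be available and already initialized.
def peak_active_gb() -> float:
    stats = torch.cuda.memory_stats()
    return round(stats["active_bytes.all.peak"] / 2**30, 4)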