@@ -78,7 +78,6 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
            model.train()
            total_loss = 0.0
            data_set_len = 0
-
            for step, batch in enumerate(tqdm(train_dataloader,colour="blue", desc=f"Training Epoch{epoch}")):
                for key in batch.keys():
                    if train_config.enable_fsdp:
@@ -116,6 +115,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche

        print(f"Max CUDA memory allocated was {memtrace.peak} GB")
        print(f"Max CUDA memory reserved was {memtrace.max_reserved} GB")
+       print(f"Peak active CUDA memory was {memtrace.peak_active_gb} GB")
        print(f"Cuda Malloc retires : {memtrace.cuda_malloc_retires}")
        print(f"CPU Total Peak Memory consumed during the train (max): {memtrace.cpu_peaked + memtrace.cpu_begin} GB")

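Note on the added line: `memtrace.peak_active_gb` implies the MemoryTrace context manager samples PyTorch's allocator statistics. Below is a minimal sketch of how such an attribute could be computed, assuming MemoryTrace wraps torch.cuda.memory_stats(); the byte2gb helper and the class body are illustrative, not necessarily the repository's actual code, and the CPU-side attributes printed above are omitted for brevity. ("active_bytes.all.peak" and "num_alloc_retries" are real torch.cuda.memory_stats() keys.)

import gc
import torch

def byte2gb(num_bytes):
    # Convert a raw byte count to gigabytes (2**30 bytes), rounded for printing.
    return round(num_bytes / 2**30, 2)

class MemoryTrace:
    # Illustrative sketch only: tracks GPU memory usage over a with-block.
    def __enter__(self):
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()  # peak counters start from here
        self.begin = byte2gb(torch.cuda.memory_allocated())
        return self

    def __exit__(self, *exc):
        gc.collect()
        stats = torch.cuda.memory_stats()
        self.peak = byte2gb(torch.cuda.max_memory_allocated())
        self.max_reserved = byte2gb(torch.cuda.max_memory_reserved())
        # Peak bytes held by live tensors; this can sit well below
        # max_memory_reserved, since the caching allocator also keeps
        # freed blocks in reserve.
        self.peak_active_gb = byte2gb(stats["active_bytes.all.peak"])
        # Times cudaMalloc failed and the allocator retried after flushing
        # its cache; nonzero values signal the job is near out-of-memory.
        self.cuda_malloc_retires = stats.get("num_alloc_retries", 0)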
@@ -151,7 +151,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
            )


-           if local_rank == 0 and eval_epoch_loss < best_val_loss:
+           if eval_epoch_loss < best_val_loss:
                best_val_loss = eval_epoch_loss
                print(f"best eval loss on epoch {epoch} is {best_val_loss}")
            val_loss.append(best_val_loss)
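Note on the changed condition: with the `local_rank == 0` guard, only rank 0 ever updated `best_val_loss`, so every other rank kept appending the stale initial value to `val_loss`, and any later logic keyed on the best loss could diverge across ranks. Dropping the guard is safe as long as `eval_epoch_loss` is identical on every rank; the sketch below shows one way that could be enforced under torch.distributed (the helper name and the all-reduce are illustrative assumptions, not part of this change):

import torch
import torch.distributed as dist

def synced_eval_loss(local_eval_loss: torch.Tensor) -> torch.Tensor:
    # Sum the per-rank eval losses in place, then average, so every rank
    # compares the same number against best_val_loss.
    dist.all_reduce(local_eval_loss, op=dist.ReduceOp.SUM)
    return local_eval_loss / dist.get_world_size()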