@@ -78,13 +78,11 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
             model.train()
             total_loss = 0.0
             data_set_len = 0
-
             for step, batch in enumerate(tqdm(train_dataloader,colour="blue", desc=f"Training Epoch{epoch}")):
                 for key in batch.keys():
                     if train_config.enable_fsdp:
                         batch[key] = batch[key].to(local_rank)
                     else:
-
                         batch[key] = batch[key].to('cuda:0')
                 loss = model(**batch).loss
                 loss = loss / gradient_accumulation_steps
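The `-` lines above only drop stray blank lines; the training logic is unchanged. For context, the loop follows the usual gradient-accumulation pattern: each micro-batch loss is divided by gradient_accumulation_steps so the summed gradients match one larger batch, and the optimizer steps only once every N micro-batches. A minimal sketch of that pattern, assuming a model whose forward pass returns an object with a .loss attribute (the helper name run_accumulation_loop is illustrative, not from this repo):

    def run_accumulation_loop(model, optimizer, batches, gradient_accumulation_steps):
        # Scale each micro-batch loss so the accumulated gradient equals the
        # gradient of one batch that is gradient_accumulation_steps times larger.
        optimizer.zero_grad()
        for step, batch in enumerate(batches):
            loss = model(**batch).loss / gradient_accumulation_steps
            loss.backward()
            # Step and reset only once every N micro-batches.
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()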
@@ -117,6 +115,7 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
         print(f"Max CUDA memory allocated was {memtrace.peak} GB")
         print(f"Max CUDA memory reserved was {memtrace.max_reserved} GB")
+        print(f"Peak active CUDA memory was {memtrace.peak_active_gb} GB")
         print(f"Cuda Malloc retires : {memtrace.cuda_malloc_retires}")
         print(f"CPU Total Peak Memory consumed during the train (max): {memtrace.cpu_peaked + memtrace.cpu_begin} GB")
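The added print reads memtrace.peak_active_gb, the allocator's peak of live ("active") tensor memory, which is distinct from the peak allocated and peak reserved figures already reported. A hedged sketch of how a MemoryTrace-style context manager could collect these numbers from PyTorch's allocator statistics; the class name and fields below mirror the prints above, but this is an illustration under that assumption, not the repo's actual memory_utils implementation (CPU-side figures such as cpu_peaked would need separate process-level tracking, e.g. psutil, and are omitted):

    import gc
    import torch

    def _to_gb(num_bytes):
        # Convert bytes to gigabytes for readable reporting.
        return num_bytes / 2**30

    class MemoryTraceSketch:
        # Records the CUDA memory statistics printed above.
        def __enter__(self):
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
            return self

        def __exit__(self, *exc):
            stats = torch.cuda.memory_stats()
            self.peak = _to_gb(torch.cuda.max_memory_allocated())
            self.max_reserved = _to_gb(torch.cuda.max_memory_reserved())
            # Peak memory occupied by live tensors, as seen by the caching allocator.
            self.peak_active_gb = _to_gb(stats["active_bytes.all.peak"])
            # Times cudaMalloc failed and the allocator flushed its cache and retried.
            self.cuda_malloc_retires = stats.get("num_alloc_retries", 0)

Usage would mirror the training loop in the diff: wrap an epoch in `with MemoryTraceSketch() as memtrace:` and print the recorded fields afterwards.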