
Adding tests for the save_metrics param in the train function

Beto · 1 year ago · commit 1f5b202c18
2 changed files with 56 additions and 3 deletions:
  1. src/llama_recipes/utils/train_utils.py (+5 −3)
  2. tests/test_train_utils.py (+51 −0)

src/llama_recipes/utils/train_utils.py (+5 −3)

@@ -69,7 +69,6 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     val_loss =[]
 
     if train_config.save_metrics:
-        print(f"Save metrics TRUE {train_config.save_metrics}")
         metrics_filename = f"{train_config.output_dir}/metrics_data_{local_rank}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
         train_step_perplexity = []
         train_step_loss = []
@@ -96,8 +95,9 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                 with autocast():
                     loss = model(**batch).loss
                 loss = loss / gradient_accumulation_steps
-                train_step_loss.append(loss.detach().float().item())
-                train_step_perplexity.append(float(torch.exp(loss.detach().float())))
+                if train_config.save_metrics:
+                    train_step_loss.append(loss.detach().float().item())
+                    train_step_perplexity.append(float(torch.exp(loss.detach().float())))
                 total_loss += loss.detach().float()
                 if train_config.use_fp16:
                     # if fp16 is enabled, use gradient scaler to handle gradient update
@@ -245,6 +245,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
         results['avg_eval_loss'] = avg_eval_loss
     results["avg_epoch_time"] = avg_epoch_time
     results["avg_checkpoint_time"] = avg_checkpoint_time
+    if train_config.save_metrics:
+        results["metrics_filename"] = metrics_filename
 
     #saving the training params including fsdp setting for reference.
     if train_config.enable_fsdp and not train_config.use_peft:
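For context, here is a minimal, self-contained sketch of the pattern the diff above introduces: per-step loss and perplexity are collected only when save_metrics is enabled, and the metrics JSON filename is surfaced through the returned results dict. The run_steps function below is an illustrative stand-in for the repo's train(), not actual llama_recipes code.

    import json
    import math
    from datetime import datetime

    def run_steps(losses, save_metrics, output_dir, local_rank=0):
        results = {}
        if save_metrics:
            # Filename format mirrors the one in the diff above.
            metrics_filename = (
                f"{output_dir}/metrics_data_{local_rank}-"
                f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
            )
            train_step_loss = []
            train_step_perplexity = []

        for loss in losses:
            # Per-step metrics are accumulated only when requested, so the
            # default path carries no extra memory or I/O cost.
            if save_metrics:
                train_step_loss.append(float(loss))
                train_step_perplexity.append(math.exp(float(loss)))

        if save_metrics:
            with open(metrics_filename, "w") as f:
                json.dump(
                    {
                        "train_step_loss": train_step_loss,
                        "train_step_perplexity": train_step_perplexity,
                    },
                    f,
                )
            results["metrics_filename"] = metrics_filename
        return results

    # Example: run_steps([0.9, 0.7, 0.5], save_metrics=True, output_dir=".")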

tests/test_train_utils.py (+51 −0)

@@ -2,11 +2,27 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
 from unittest.mock import patch
+import pytest
 
 import torch
 
+import os
+import shutil
+
 from llama_recipes.utils.train_utils import train
 
+TEMP_OUTPUT_DIR = os.getcwd() + "/tmp"
+
+@pytest.fixture(scope="session")
+def temp_output_dir():
+    # Create the directory during the session-level setup
+    temp_output_dir = "tmp"
+    os.mkdir(os.path.join(os.getcwd(), temp_output_dir))
+    yield temp_output_dir
+    # Delete the directory during the session-level teardown
+    shutil.rmtree(temp_output_dir)
+
+
 @patch("llama_recipes.utils.train_utils.MemoryTrace")
 @patch("llama_recipes.utils.train_utils.nullcontext")
 @patch("llama_recipes.utils.train_utils.torch.cuda.amp.GradScaler")
@@ -28,6 +44,7 @@ def test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker)
     train_config.use_fp16 = False
     train_config.run_validation = False
     train_config.gradient_clipping = False
+    train_config.save_metrics = False
 
     train(
         model,
@@ -63,3 +80,37 @@ def test_gradient_accumulation(autocast, scaler, nullcontext, mem_trace, mocker)
     assert optimizer.zero_grad.call_count == 3
     assert nullcontext.call_count == 0
     assert autocast.call_count == 5
+
+def test_save_to_json(temp_output_dir, mocker):
+    model = mocker.MagicMock(name="model")
+    model().loss.__truediv__().detach.return_value = torch.tensor(1)
+    mock_tensor = mocker.MagicMock(name="tensor")
+    batch = {"input": mock_tensor}
+    train_dataloader = [batch, batch, batch, batch, batch]
+    eval_dataloader = None
+    tokenizer = mocker.MagicMock()
+    optimizer = mocker.MagicMock()
+    lr_scheduler = mocker.MagicMock()
+    gradient_accumulation_steps = 1
+    train_config = mocker.MagicMock()
+    train_config.enable_fsdp = False
+    train_config.use_fp16 = False
+    train_config.run_validation = False
+    train_config.gradient_clipping = False
+    train_config.save_metrics = True
+    train_config.output_dir = temp_output_dir
+
+    results = train(
+        model,
+        train_dataloader,
+        eval_dataloader,
+        tokenizer,
+        optimizer,
+        lr_scheduler,
+        gradient_accumulation_steps,
+        train_config,
+        local_rank=0
+    )
+
+    assert results["metrics_filename"] not in ["", None]
+
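A note on the new fixture: the commit creates and tears down the tmp directory by hand with os.mkdir and shutil.rmtree. An equivalent session-scoped fixture could lean on pytest's built-in tmp_path_factory, which handles cleanup itself; the sketch below is an alternative for comparison, not what the commit uses.

    import pytest

    @pytest.fixture(scope="session")
    def temp_output_dir(tmp_path_factory):
        # tmp_path_factory is pytest's built-in, session-scoped temp
        # directory factory; pytest prunes old directories on its own,
        # so no manual shutil.rmtree teardown is needed.
        return str(tmp_path_factory.mktemp("metrics"))

Either way, the test only asserts that results["metrics_filename"] is neither empty nor None, so any writable directory works.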