@@ -19,19 +19,19 @@ def check_padded_entry(batch):
 
 @pytest.mark.skip_missing_tokenizer
 @patch('llama_recipes.finetuning.train')
-@patch('llama_recipes.finetuning.LlamaTokenizer')
+@patch('llama_recipes.finetuning.AutoTokenizer')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
 @patch('llama_recipes.finetuning.optim.AdamW')
 @patch('llama_recipes.finetuning.StepLR')
-def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker, setup_tokenizer):
+def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker, setup_tokenizer, llama_version):
     from llama_recipes.finetuning import main
 
     setup_tokenizer(tokenizer)
 
     kwargs = {
         "dataset": "custom_dataset",
-        "model_name": "meta-llama/Llama-2-7b-hf",
-        "custom_dataset.file": "examples/custom_dataset.py",
+        "model_name": llama_version,
+        "custom_dataset.file": "recipes/finetuning/datasets/custom_dataset.py",
         "custom_dataset.train_split": "validation",
         "batch_size_training": 2,
         "val_batch_size": 4,
@@ -63,12 +63,11 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
     check_padded_entry(batch)
 
     it = iter(train_dataloader)
-    for _ in range(5):
-        next(it)
+    next(it)
 
     batch = next(it)
     STRING = tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True)
-    EXPECTED_STRING = "[INST] How do I initialize a Typescript project using npm and git? [/INST] # Initialize a new NPM project"
+    EXPECTED_STRING = "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!"
     assert STRING.startswith(EXPECTED_STRING)
 
     assert batch["input_ids"].size(0) == 2
@@ -80,7 +79,7 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
 
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
-@patch('llama_recipes.finetuning.LlamaTokenizer.from_pretrained')
+@patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
 @patch('llama_recipes.finetuning.optim.AdamW')
 @patch('llama_recipes.finetuning.StepLR')
 def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train, mocker):
@@ -90,7 +89,7 @@ def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train,
 
     kwargs = {
         "dataset": "custom_dataset",
-        "custom_dataset.file": "examples/custom_dataset.py:get_unknown_dataset",
+        "custom_dataset.file": "recipes/finetuning/datasets/custom_dataset.py:get_unknown_dataset",
         "batch_size_training": 1,
         "use_peft": False,
     }
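
For reference, the "custom_dataset.file" option points at a Python module that llama_recipes imports at runtime, and the "file.py:function_name" form used in the get_unknown_dataset test selects a specific loader function inside that file. The sketch below is not part of this diff; it is a minimal illustration of what such a module can look like, assuming the default get_custom_dataset(dataset_config, tokenizer, split) entry point and using OpenAssistant/oasst1 purely as an example dataset.

# Hypothetical custom dataset module, for illustration only (not part of this change).
# llama_recipes resolves "custom_dataset.file" to a module like this and calls the
# loader below; "path.py:other_function" would select a different function in the file.
import datasets


def get_custom_dataset(dataset_config, tokenizer, split):
    # Load the requested split (the test above maps the train split to "validation").
    data = datasets.load_dataset("OpenAssistant/oasst1", split=split)

    def tokenize(sample):
        # Tokenize the raw text; labels mirror input_ids for causal LM training.
        ids = tokenizer(sample["text"]).input_ids
        return {
            "input_ids": ids,
            "attention_mask": [1] * len(ids),
            "labels": list(ids),
        }

    # Drop the original columns so only tokenized fields reach the dataloader.
    return data.map(tokenize, remove_columns=list(data.features))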