@@ -4,6 +4,7 @@
 import os
 import time
 import yaml
+from contextlib import nullcontext
 from pathlib import Path
 from pkg_resources import packaging
 
@@ -56,7 +57,9 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     elif train_config.use_fp16 and not train_config.enable_fsdp:
         scaler = torch.cuda.amp.GradScaler()
     if train_config.enable_fsdp:
-        world_size = int(os.environ["WORLD_SIZE"])
+        world_size = int(os.environ["WORLD_SIZE"])
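+    # with fp16 off, nullcontext supplies a no-op context manager, so the "with autocast():" below adds no overhead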
+    autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext
+
     train_prep = []
     train_loss = []
     val_prep = []
@@ -71,17 +74,21 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
         model.train()
         total_loss = 0.0
         total_length = len(train_dataloader)//gradient_accumulation_steps
-        pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch}", total=total_length)
+        pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", total=total_length, dynamic_ncols=True)
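+        # dynamic_ncols resizes the bar with the terminal; total counts optimizer updates, not raw batches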
         for step, batch in enumerate(train_dataloader):
             for key in batch.keys():
                 if train_config.enable_fsdp:
-                    batch[key] = batch[key].to(local_rank)
+                    if is_xpu_available():
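+                        # on Intel XPU builds, send tensors to this rank's xpu device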
+                        batch[key] = batch[key].to(torch.device(f"xpu:{local_rank}"))
+                    else:
+                        batch[key] = batch[key].to(local_rank)
                 else:
                     if is_xpu_available():
                         batch[key] = batch[key].to('xpu:0')
                     else:
                         batch[key] = batch[key].to('cuda:0')
-            loss = model(**batch).loss
+            with autocast():
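+                # forward pass runs in fp16 under autocast when use_fp16 is set; otherwise nullcontext leaves it in full precision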
+                loss = model(**batch).loss
             loss = loss / gradient_accumulation_steps
             total_loss += loss.detach().float()
             if train_config.use_fp16:
@@ -91,16 +98,17 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                     scaler.step(optimizer)
                     scaler.update()
                     optimizer.zero_grad()
-                    pbar.update(step//gradient_accumulation_steps)
+                    pbar.update(1)
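+                    # one optimizer step per tick; the old step//gradient_accumulation_steps argument over-advanced the bar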
             else:
                 # regular backpropagation when fp16 is not used
                 loss.backward()
                 if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                     optimizer.step()
                     optimizer.zero_grad()
-                    pbar.update(step//gradient_accumulation_steps)
-
-            pbar.set_description(f"Training Epoch: {epoch}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()})")
+                    pbar.update(1)
+
+            pbar.set_description(f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()})")
+        pbar.close()
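+        # closing the bar here lets the next epoch start on a fresh line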
 
         epoch_end_time = time.perf_counter()-epoch_start_time
         epoch_times.append(epoch_end_time)
@@ -195,16 +203,16 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
                 best_val_loss = eval_epoch_loss
                 if train_config.enable_fsdp:
                     if rank==0:
-                        print(f"best eval loss on epoch {epoch} is {best_val_loss}")
+                        print(f"best eval loss on epoch {epoch+1} is {best_val_loss}")
                 else:
-                    print(f"best eval loss on epoch {epoch} is {best_val_loss}")
+                    print(f"best eval loss on epoch {epoch+1} is {best_val_loss}")
             val_loss.append(best_val_loss)
             val_prep.append(eval_ppl)
         if train_config.enable_fsdp:
             if rank==0:
-                print(f"Epoch {epoch+1}: train_perplexity={train_perplexity:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epcoh time {epoch_end_time}s")
+                print(f"Epoch {epoch+1}: train_perplexity={train_perplexity:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s")
         else:
-            print(f"Epoch {epoch+1}: train_perplexity={train_perplexity:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epcoh time {epoch_end_time}s")
+            print(f"Epoch {epoch+1}: train_perplexity={train_perplexity:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s")
     avg_epoch_time = sum(epoch_times)/ len(epoch_times)
     avg_checkpoint_time = sum(checkpoint_times)/ len(checkpoint_times) if len(checkpoint_times) > 0 else 0
     avg_train_prep = sum(train_prep)/len(train_prep)
@@ -245,7 +253,7 @@ def evaluation(model,train_config, eval_dataloader, local_rank, tokenizer):
     eval_preds = []
     eval_loss = 0.0  # Initialize evaluation loss
     with MemoryTrace() as memtrace:
-        for step, batch in enumerate(tqdm(eval_dataloader,colour="green", desc="evaluating Epoch")):
+        for step, batch in enumerate(tqdm(eval_dataloader,colour="green", desc="evaluating Epoch", dynamic_ncols=True)):
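+            # same dynamic_ncols fix as the training loop's progress bar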
             for key in batch.keys():
                 if train_config.enable_fsdp:
                     batch[key] = batch[key].to(local_rank)