1 year ago · 895dfcea30
--- a/configs/training.py
+++ b/configs/training.py
@@ -7,7 +7,8 @@ from typing import ClassVar
 
				 @dataclass
			
 
				 class train_config:
			
 
				     model_name: str="PATH/to/LLAMA/7B"
			
 
				-    enable_fsdp: bool= False 
			
 
				+    enable_fsdp: bool=False
			
 
				+    low_cpu_fsdp: bool=False
			
 
				     run_validation: bool=True
			
 
				     batch_size_training: int=4
			
 
				     num_epochs: int=3
			
--- a/llama_finetuning.py
+++ b/llama_finetuning.py
@@ -93,10 +93,16 @@ def main(**kwargs):
 
				     gradient_accumulation_steps = train_config.batch_size_training // train_config.micro_batch_size
			
 
				      
			
 
				     # Load the pre-trained model and setup its configuration
			
 
				-    if train_config.enable_fsdp:
			
 
				-        # for FSDP, we save cpu memory by loading pretrained model on rank0 only.
			
 
				+    if train_config.enable_fsdp and train_config.low_cpu_fsdp:
			
 
				+        # for FSDP, we can save cpu memory by loading pretrained model on rank0 only.
			
 
				         # this avoids cpu oom when loading large models like llama 70B, in which case
			
 
				-        # model alone would consume 2+TB cpu mem (70 * 4 * 8)
			
 
				+        # model alone would consume 2+TB cpu mem (70 * 4 * 8). This will add some comms
			
 
				+        # overhead and currently requires a PyTorch nightly (dev) build dated 20230701 or later.
			
 
				+        v = packaging.version.parse(torch.__version__)
			
 
				+        verify_latest_nightly = v.is_devrelease and v.dev >= 20230701
			
 
				+        if not verify_latest_nightly:
			
 
				+            raise Exception("latest pytorch nightly build is required to run with low_cpu_fsdp config, "
			
 
				+                            "please install latest nightly.")
			
 
				         if rank == 0:
			
 
				             model = LlamaForCausalLM.from_pretrained(
			
 
				                 train_config.model_name,