@@ -57,8 +57,6 @@ def main(**kwargs):
     # Set the seeds for reproducibility
     if is_xpu_available():
         torch.xpu.manual_seed(train_config.seed)
-    else:
-        torch.cuda.manual_seed(train_config.seed)
     torch.manual_seed(train_config.seed)
     random.seed(train_config.seed)
 
@@ -72,7 +70,7 @@ def main(**kwargs):
     if torch.distributed.is_initialized():
         if is_xpu_available():
             torch.xpu.set_device(local_rank)
-        else:
+        elif torch.cuda.is_available():
             torch.cuda.set_device(local_rank)
         clear_gpu_cache(local_rank)
         setup_environ_flags(rank)
@@ -135,7 +133,7 @@ def main(**kwargs):
 
     hsdp_device_mesh = None
     if fsdp_config.hsdp and fsdp_config.sharding_strategy == ShardingStrategy.HYBRID_SHARD:
-        hsdp_device_mesh = hdsp_device_mesh(replica_group_size=fsdp_config.replica_group_size, sharding_group_size=fsdp_config.sharding_group_size)
+        hsdp_device_mesh = hsdp_device_mesh(replica_group_size=fsdp_config.replica_group_size, sharding_group_size=fsdp_config.sharding_group_size)
         print("HSDP device mesh is ready")
 
     #setting up FSDP if enable_fsdp is enabled
@@ -146,6 +144,12 @@ def main(**kwargs):
 
         mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
         my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, LlamaDecoderLayer)
+
+        device_id = 0
+        if is_xpu_available():
+            device_id = torch.xpu.current_device()
+        elif torch.cuda.is_available():
+            device_id = torch.cuda.current_device()
 
         model = FSDP(
             model,
@@ -154,7 +158,7 @@ def main(**kwargs):
             mixed_precision=mixed_precision_policy if not fsdp_config.pure_bf16 else None,
             sharding_strategy=fsdp_config.sharding_strategy,
             device_mesh=hsdp_device_mesh,
-            device_id=torch.xpu.current_device() if is_xpu_available() else torch.cuda.current_device(),
+            device_id=device_id,
             limit_all_gathers=True,
             sync_module_states=train_config.low_cpu_fsdp,
             param_init_fn=lambda module: module.to_empty(device=torch.device("cuda"), recurse=False)
@@ -165,7 +169,7 @@ def main(**kwargs):
     elif not train_config.quantization and not train_config.enable_fsdp:
         if is_xpu_available():
             model.to("xpu:0")
-        else:
+        elif torch.cuda.is_available():
             model.to("cuda")
 
     dataset_config = generate_dataset_config(train_config, kwargs)