
Update to use the HF native Flash Attention

Hamid Shojanazeri · 1 year ago
parent commit 6853267317

+ 1 - 13
examples/code_llama/code_completion_example.py

@@ -51,24 +51,12 @@ def main(
     torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
     
-    model = load_model(model_name, quantization)
+    model = load_model(model_name, quantization, use_fast_kernels)
     if peft_model:
         model = load_peft_model(model, peft_model)
 
     model.eval()
     
-    if use_fast_kernels:
-        """
-        Setting 'use_fast_kernels' will enable
-        using of Flash Attention or Xformer memory-efficient kernels 
-        based on the hardware being used. This would speed up inference when used for batched inputs.
-        """
-        try:
-            from optimum.bettertransformer import BetterTransformer
-            model = BetterTransformer.transform(model)    
-        except ImportError:
-            print("Module 'optimum' not found. Please install 'optimum' it before proceeding.")
-
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     safety_checker = get_safety_checker(enable_azure_content_safety,
                                         enable_sensitive_topics,
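
Across all three examples the inline BetterTransformer conversion is removed and `use_fast_kernels` is instead forwarded to `load_model`, so the Flash Attention / SDPA kernel choice is made once inside the loader using Transformers' native support. A minimal sketch of what such a loader could look like is below; the exact signature and the `attn_implementation` keyword are assumptions based on this commit, not the repository's actual code.

    # Hedged sketch of a load_model that consumes use_fast_kernels natively.
    # The attn_implementation kwarg is assumed; the real llama-recipes loader may differ.
    from transformers import AutoModelForCausalLM

    def load_model(model_name, quantization, use_fast_kernels):
        return AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True if quantization else None,   # 8-bit weights when quantized
            device_map="auto",
            attn_implementation="sdpa" if use_fast_kernels else "eager",
        )

Depending on the Transformers version pinned at the time of this commit, the equivalent switch may have been the older `use_flash_attention_2=True` argument rather than `attn_implementation`.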

+ 2 - 14
examples/code_llama/code_infilling_example.py

@@ -49,25 +49,13 @@ def main(
     torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
     
-    model = load_model(model_name, quantization)
+    model = load_model(model_name, quantization, use_fast_kernels)
     model.config.tp_size=1
     if peft_model:
         model = load_peft_model(model, peft_model)
 
     model.eval()
-    
-    if use_fast_kernels:
-        """
-        Setting 'use_fast_kernels' will enable
-        using of Flash Attention or Xformer memory-efficient kernels 
-        based on the hardware being used. This would speed up inference when used for batched inputs.
-        """
-        try:
-            from optimum.bettertransformer import BetterTransformer
-            model = BetterTransformer.transform(model)    
-        except ImportError:
-            print("Module 'optimum' not found. Please install 'optimum' it before proceeding.")
-
+   
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     
     safety_checker = get_safety_checker(enable_azure_content_safety,

+ 1 - 12
examples/code_llama/code_instruct_example.py

@@ -84,22 +84,11 @@ def main(
     torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
     
-    model = load_model(model_name, quantization)
+    model = load_model(model_name, quantization, use_fast_kernels)
     if peft_model:
         model = load_peft_model(model, peft_model)
 
     model.eval()
-    
-    if use_fast_kernels:
-        """
-        Setting 'use_fast_kernels' will enable
-        using of Flash Attention or Xformer memory-efficient kernels 
-        based on the hardware being used. This would speed up inference when used for batched inputs.
-        """
-        try:
-            model.to_bettertransformer()   
-        except ImportError:
-            print("Please check the Transformers version that support to_bettertransformer natively.")
         
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     safety_checker = get_safety_checker(enable_azure_content_safety,