1 年之前 · c014ae7cb8
--- a/inference/code-llama/code_completion_example.py
+++ b/inference/code-llama/code_completion_example.py
@@ -34,7 +34,7 @@ def main(
 
				     enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs
			
 
				     enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5
			
 
				     max_padding_length: int=None, # the max padding length to be used with tokenizer padding the prompts.
			
 
				-    use_fast_kernels: bool = False, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
			
 
				+    use_fast_kernels: bool = True, # Enable using SDPA from PyTorch Accelerated Transformers, making use of Flash Attention and Xformer memory-efficient kernels
			
 
				     **kwargs
			
 
				 ):
			
 
				     if prompt_file is not None:
			
--- a/inference/code-llama/code_infilling_example.py
+++ b/inference/code-llama/code_infilling_example.py
@@ -34,7 +34,7 @@ def main(
 
				     enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs
			
 
				     enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5
			
 
				     max_padding_length: int=None, # the max padding length to be used with tokenizer padding the prompts.
			
 
				-    use_fast_kernels: bool = False, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
			
 
				+    use_fast_kernels: bool = True, # Enable using SDPA from PyTorch Accelerated Transformers, making use of Flash Attention and Xformer memory-efficient kernels
			
 
				     **kwargs
			
 
				 ):
			
 
				     if prompt_file is not None: