
Set default values for temperature and top_p

Hamid Shojanazeri 1 year ago
parent commit c62428b99c

+ 2 - 2
inference/code-llama/code_completion_example.py

@@ -25,8 +25,8 @@ def main(
     do_sample: bool=True, #Whether or not to use sampling ; use greedy decoding otherwise.
     min_length: int=None, #The minimum length of the sequence to be generated, input prompt + min_new_tokens
     use_cache: bool=True,  #[optional] Whether or not the model should use the past key/values attentions (if applicable to the model) to speed up decoding.
-    top_p: float=1.0, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
-    temperature: float=1.0, # [optional] The value used to modulate the next token probabilities.
+    top_p: float=0.9, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+    temperature: float=0.6, # [optional] The value used to modulate the next token probabilities.
     top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
     repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty.
     length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation. 
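
The two parameters being changed only take effect when `do_sample=True`. As a minimal sketch (not the repository's exact script; the checkpoint name and prompt are placeholders), this is how the new defaults would typically be forwarded to a Hugging Face `generate` call:

```python
# Sketch: passing the new sampling defaults (top_p=0.9, temperature=0.6)
# to a Hugging Face generate() call. Checkpoint and prompt are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "codellama/CodeLlama-7b-hf"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

prompt = "def fibonacci(n):"
batch = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **batch,
        max_new_tokens=100,
        do_sample=True,      # sampling must be enabled for top_p/temperature to matter
        top_p=0.9,           # new default: nucleus sampling over the top 90% of probability mass
        temperature=0.6,     # new default: sharpen the next-token distribution
        top_k=50,
        repetition_penalty=1.0,
        use_cache=True,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```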

+ 2 - 2
inference/code-llama/code_infilling_example.py

@@ -25,8 +25,8 @@ def main(
     do_sample: bool=True, #Whether or not to use sampling ; use greedy decoding otherwise.
     min_length: int=None, #The minimum length of the sequence to be generated, input prompt + min_new_tokens
     use_cache: bool=True,  #[optional] Whether or not the model should use the past key/values attentions (if applicable to the model) to speed up decoding.
-    top_p: float=1.0, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
-    temperature: float=1.0, # [optional] The value used to modulate the next token probabilities.
+    top_p: float=0.9, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+    temperature: float=0.6, # [optional] The value used to modulate the next token probabilities.
     top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
     repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty.
     length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation.
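
For readers unfamiliar with the two knobs, the sketch below illustrates the standard temperature-scaling plus nucleus (top-p) filtering recipe that these defaults control. It is an illustration of the technique, not the transformers library's internal implementation:

```python
# Illustrative only: temperature scaling followed by nucleus (top-p) filtering
# over a toy next-token distribution.
import torch

def sample_next_token(logits: torch.Tensor, temperature: float = 0.6, top_p: float = 0.9) -> int:
    # Temperature < 1 sharpens the distribution (more deterministic);
    # temperature > 1 flattens it (more random).
    probs = torch.softmax(logits / temperature, dim=-1)

    # Nucleus filtering: keep the smallest set of tokens whose cumulative
    # probability reaches top_p, zero out the rest, then renormalize.
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    keep = cumulative - sorted_probs < top_p  # always keeps at least the top token
    filtered = torch.where(keep, sorted_probs, torch.zeros_like(sorted_probs))
    filtered = filtered / filtered.sum()

    choice = torch.multinomial(filtered, num_samples=1)
    return sorted_idx[choice].item()

# Example with a toy vocabulary of 5 tokens.
logits = torch.tensor([2.0, 1.5, 0.3, -1.0, -2.0])
print(sample_next_token(logits))
```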