code_completion_example.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. # Copyright (c) Meta Platforms, Inc. and affiliates.
  2. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
  3. # from accelerate import init_empty_weights, load_checkpoint_and_dispatch
  4. import fire
  5. import torch
  6. import os
  7. import sys
  8. import time
  9. from typing import List
  10. from transformers import CodeLlamaTokenizer
  11. sys.path.append("..")
  12. from safety_utils import get_safety_checker
  13. from model_utils import load_model, load_peft_model, load_llama_from_config
  14. def main(
  15. model_name,
  16. peft_model: str=None,
  17. quantization: bool=False,
  18. max_new_tokens =100, #The maximum numbers of tokens to generate
  19. prompt_file: str=None,
  20. seed: int=42, #seed value for reproducibility
  21. do_sample: bool=True, #Whether or not to use sampling ; use greedy decoding otherwise.
  22. min_length: int=None, #The minimum length of the sequence to be generated, input prompt + min_new_tokens
  23. use_cache: bool=True, #[optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
  24. top_p: float=1.0, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  25. temperature: float=1.0, # [optional] The value used to modulate the next token probabilities.
  26. top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
  27. repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty.
  28. length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation.
  29. enable_azure_content_safety: bool=False, # Enable safety check with Azure content safety api
  30. enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs
  31. enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5
  32. max_padding_length: int=None, # the max padding length to be used with tokenizer padding the prompts.
  33. use_fast_kernels: bool = False, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
  34. **kwargs
  35. ):
  36. if prompt_file is not None:
  37. assert os.path.exists(
  38. prompt_file
  39. ), f"Provided Prompt file does not exist {prompt_file}"
  40. with open(prompt_file, "r") as f:
  41. user_prompt = f.read()
  42. else:
  43. print("No user prompt provided. Exiting.")
  44. sys.exit(1)
  45. # Set the seeds for reproducibility
  46. torch.cuda.manual_seed(seed)
  47. torch.manual_seed(seed)
  48. model = load_model(model_name, quantization)
  49. if peft_model:
  50. model = load_peft_model(model, peft_model)
  51. model.eval()
  52. if use_fast_kernels:
  53. """
  54. Setting 'use_fast_kernels' will enable
  55. using of Flash Attention or Xformer memory-efficient kernels
  56. based on the hardware being used. This would speed up inference when used for batched inputs.
  57. """
  58. try:
  59. from optimum.bettertransformer import BetterTransformer
  60. model = BetterTransformer.transform(model)
  61. except ImportError:
  62. print("Module 'optimum' not found. Please install 'optimum' it before proceeding.")
  63. tokenizer = CodeLlamaTokenizer.from_pretrained(model_name)
  64. tokenizer.add_special_tokens(
  65. {
  66. "pad_token": "<PAD>",
  67. }
  68. )
  69. model.resize_token_embeddings(model.config.vocab_size + 1)
  70. safety_checker = get_safety_checker(enable_azure_content_safety,
  71. enable_sensitive_topics,
  72. enable_salesforce_content_safety,
  73. )
  74. # Safety check of the user prompt
  75. safety_results = [check(user_prompt) for check in safety_checker]
  76. are_safe = all([r[1] for r in safety_results])
  77. if are_safe:
  78. print("User prompt deemed safe.")
  79. print(f"User prompt:\n{user_prompt}")
  80. else:
  81. print("User prompt deemed unsafe.")
  82. for method, is_safe, report in safety_results:
  83. if not is_safe:
  84. print(method)
  85. print(report)
  86. print("Skipping the inference as the prompt is not safe.")
  87. sys.exit(1) # Exit the program with an error status
  88. batch = tokenizer(user_prompt, padding='max_length', truncation=True, max_length=max_padding_length, return_tensors="pt")
  89. batch = {k: v.to("cuda") for k, v in batch.items()}
  90. start = time.perf_counter()
  91. with torch.no_grad():
  92. outputs = model.generate(
  93. **batch,
  94. max_new_tokens=max_new_tokens,
  95. do_sample=do_sample,
  96. top_p=top_p,
  97. temperature=temperature,
  98. min_length=min_length,
  99. use_cache=use_cache,
  100. top_k=top_k,
  101. repetition_penalty=repetition_penalty,
  102. length_penalty=length_penalty,
  103. **kwargs
  104. )
  105. e2e_inference_time = (time.perf_counter()-start)*1000
  106. print(f"the inference time is {e2e_inference_time} ms")
  107. output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  108. # Safety check of the model output
  109. safety_results = [check(output_text) for check in safety_checker]
  110. are_safe = all([r[1] for r in safety_results])
  111. if are_safe:
  112. print("User input and model output deemed safe.")
  113. print(f"Model output:\n{output_text}")
  114. else:
  115. print("Model output deemed unsafe.")
  116. for method, is_safe, report in safety_results:
  117. if not is_safe:
  118. print(method)
  119. print(report)
  120. if __name__ == "__main__":
  121. fire.Fire(main)