generation.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import json
import os
import sys
import time
from pathlib import Path
from typing import List, Literal, Optional, Tuple, TypedDict

import torch
import torch.nn.functional as F
from fairscale.nn.model_parallel.initialize import (
    get_model_parallel_rank,
    initialize_model_parallel,
    model_parallel_is_initialized,
)

from examples.llama_guard.model import ModelArgs, Transformer
from examples.llama_guard.tokenizer import Tokenizer

Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    role: Role
    content: str


class CompletionPrediction(TypedDict, total=False):
    generation: str
    tokens: List[str]  # not required
    logprobs: List[float]  # not required


class ChatPrediction(TypedDict, total=False):
    generation: Message
    tokens: List[str]  # not required
    logprobs: List[float]  # not required


Dialog = List[Message]

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

SPECIAL_TAGS = [B_INST, E_INST, "<<SYS>>", "<</SYS>>"]
UNSAFE_ERROR = "Error: special tags are not allowed as part of the prompt."
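
# Illustrative sketch of the prompt layout built by chat_completion below: an optional
# system message is folded into the first user turn and each exchange is encoded
# roughly as
#   <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST] {assistant} </s>
# which is why user-supplied text containing these special tags is rejected with
# UNSAFE_ERROR.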


class Llama:
    @staticmethod
    def build(
        ckpt_dir: str,
        tokenizer_path: str,
        max_seq_len: int,
        max_batch_size: int,
        model_parallel_size: Optional[int] = None,
        seed: int = 1,
    ) -> "Llama":
        """
        Build a Llama instance by initializing and loading a pre-trained model.

        Args:
            ckpt_dir (str): Path to the directory containing checkpoint files.
            tokenizer_path (str): Path to the tokenizer file.
            max_seq_len (int): Maximum sequence length for input text.
            max_batch_size (int): Maximum batch size for inference.
            model_parallel_size (Optional[int], optional): Number of model parallel processes.
                If not provided, it's determined from the environment. Defaults to None.
            seed (int, optional): Random seed, shared by all processes. Defaults to 1.

        Returns:
            Llama: An instance of the Llama class with the loaded model and tokenizer.

        Raises:
            AssertionError: If there are no checkpoint files in the specified directory.

        Note:
            This method initializes the distributed process group, sets the device to CUDA,
            and loads the pre-trained model and tokenizer.
        """
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group("nccl")
        if not model_parallel_is_initialized():
            if model_parallel_size is None:
                model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
            initialize_model_parallel(model_parallel_size)

        local_rank = int(os.environ.get("LOCAL_RANK", 0))
        torch.cuda.set_device(local_rank)

        # seed must be the same in all processes
        torch.manual_seed(seed)

        if local_rank > 0:
            sys.stdout = open(os.devnull, "w")

        start_time = time.time()
        checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
        checkpoints_size = len(checkpoints)
        assert checkpoints_size > 0, f"no checkpoint files found in {ckpt_dir}"
        ckpt_path = checkpoints[get_model_parallel_rank()]
        checkpoint = torch.load(ckpt_path, map_location="cpu")
        with open(Path(ckpt_dir) / "params.json", "r") as f:
            params = json.loads(f.read())
        model_args: ModelArgs = ModelArgs(
            max_seq_len=max_seq_len,
            max_batch_size=max_batch_size,
            **params,
        )
        tokenizer = Tokenizer(model_path=tokenizer_path)
        model_args.vocab_size = tokenizer.n_words
        torch.set_default_tensor_type(torch.cuda.HalfTensor)
        model = Transformer(model_args)
        model.load_state_dict(checkpoint, strict=False)
        print(f"Loaded in {time.time() - start_time:.2f} seconds")

        return Llama(model, tokenizer)
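
    # Illustrative usage (a minimal sketch; the paths are placeholders and the script
    # is expected to be launched under torchrun so the NCCL process group can form):
    #   llama = Llama.build(
    #       ckpt_dir="path/to/checkpoints/",
    #       tokenizer_path="path/to/tokenizer.model",
    #       max_seq_len=2048,
    #       max_batch_size=4,
    #   )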

    def __init__(self, model: Transformer, tokenizer: Tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    @torch.inference_mode()
    def generate(
        self,
        prompt_tokens: List[List[int]],
        max_gen_len: int,
        temperature: float = 0.6,
        top_p: float = 0.9,
        logprobs: bool = False,
        echo: bool = False,
    ) -> Tuple[List[List[int]], Optional[List[List[float]]]]:
        """
        Generate text sequences based on provided prompts using the language generation model.

        Args:
            prompt_tokens (List[List[int]]): List of tokenized prompts, where each prompt is represented as a list of integers.
            max_gen_len (int): Maximum length of the generated text sequence.
            temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
            top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
            logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False.
            echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.

        Returns:
            Tuple[List[List[int]], Optional[List[List[float]]]]: A tuple containing generated token sequences and, if logprobs is True, corresponding token log probabilities.

        Note:
            This method uses the provided prompts as a basis for generating text. It employs nucleus sampling to produce text with controlled randomness.
            If logprobs is True, token log probabilities are computed for each generated token.
        """
        params = self.model.params
        bsz = len(prompt_tokens)
        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)

        min_prompt_len = min(len(t) for t in prompt_tokens)
        max_prompt_len = max(len(t) for t in prompt_tokens)
        assert max_prompt_len <= params.max_seq_len
        total_len = min(params.max_seq_len, max_gen_len + max_prompt_len)

        pad_id = self.tokenizer.pad_id
        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
        for k, t in enumerate(prompt_tokens):
            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
        if logprobs:
            token_logprobs = torch.zeros_like(tokens, dtype=torch.float)

        prev_pos = 0
        eos_reached = torch.tensor([False] * bsz, device="cuda")
        input_text_mask = tokens != pad_id
        if min_prompt_len == total_len:
            logits = self.model.forward(tokens, prev_pos)
            token_logprobs = -F.cross_entropy(
                input=logits.transpose(1, 2),
                target=tokens,
                reduction="none",
                ignore_index=pad_id,
            )

        for cur_pos in range(min_prompt_len, total_len):
            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
            if temperature > 0:
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
                next_token = sample_top_p(probs, top_p)
            else:
                next_token = torch.argmax(logits[:, -1], dim=-1)

            next_token = next_token.reshape(-1)
            # only replace token if prompt has already been generated
            next_token = torch.where(
                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
            )
            tokens[:, cur_pos] = next_token
            if logprobs:
                token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
                    input=logits.transpose(1, 2),
                    target=tokens[:, prev_pos + 1 : cur_pos + 1],
                    reduction="none",
                    ignore_index=pad_id,
                )
            eos_reached |= (~input_text_mask[:, cur_pos]) & (
                next_token == self.tokenizer.eos_id
            )
            prev_pos = cur_pos
            if all(eos_reached):
                break

        if logprobs:
            token_logprobs = token_logprobs.tolist()
        out_tokens, out_logprobs = [], []
        for i, toks in enumerate(tokens.tolist()):
            # cut to max gen len
            start = 0 if echo else len(prompt_tokens[i])
            toks = toks[start : len(prompt_tokens[i]) + max_gen_len]
            probs = None
            if logprobs:
                probs = token_logprobs[i][start : len(prompt_tokens[i]) + max_gen_len]
            # cut to eos tok if any
            if self.tokenizer.eos_id in toks:
                eos_idx = toks.index(self.tokenizer.eos_id)
                toks = toks[:eos_idx]
                probs = probs[:eos_idx] if logprobs else None
            out_tokens.append(toks)
            out_logprobs.append(probs)
        return (out_tokens, out_logprobs if logprobs else None)
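
    # Illustrative usage (a sketch; assumes `llama = Llama.build(...)` has already run):
    #   toks = [llama.tokenizer.encode("Hello", bos=True, eos=False)]
    #   out_tokens, _ = llama.generate(prompt_tokens=toks, max_gen_len=16)
    #   print(llama.tokenizer.decode(out_tokens[0]))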

    def text_completion(
        self,
        prompts: List[str],
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_gen_len: Optional[int] = None,
        logprobs: bool = False,
        echo: bool = False,
    ) -> List[CompletionPrediction]:
        """
        Perform text completion for a list of prompts using the language generation model.

        Args:
            prompts (List[str]): List of text prompts for completion.
            temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
            top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
            max_gen_len (Optional[int], optional): Maximum length of the generated completion sequence.
                If not provided, it's set to the model's maximum sequence length minus 1.
            logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False.
            echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.

        Returns:
            List[CompletionPrediction]: List of completion predictions, each containing the generated text completion.

        Note:
            This method generates text completions for the provided prompts, employing nucleus sampling to introduce controlled randomness.
            If logprobs is True, token log probabilities are computed for each generated token.
        """
        if max_gen_len is None:
            max_gen_len = self.model.params.max_seq_len - 1
        prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts]
        generation_tokens, generation_logprobs = self.generate(
            prompt_tokens=prompt_tokens,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
            logprobs=logprobs,
            echo=echo,
        )
        if logprobs:
            return [
                {
                    "generation": self.tokenizer.decode(t),
                    "tokens": [self.tokenizer.decode(x) for x in t],
                    "logprobs": logprobs_i,
                }
                for t, logprobs_i in zip(generation_tokens, generation_logprobs)
            ]
        return [{"generation": self.tokenizer.decode(t)} for t in generation_tokens]
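
    # Illustrative usage (a sketch; `llama` is assumed to come from Llama.build):
    #   results = llama.text_completion(["I believe the meaning of life is"], max_gen_len=64)
    #   print(results[0]["generation"])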

    def chat_completion(
        self,
        dialogs: List[Dialog],
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_gen_len: Optional[int] = None,
        logprobs: bool = False,
    ) -> List[ChatPrediction]:
        """
        Generate assistant responses for a list of conversational dialogs using the language generation model.

        Args:
            dialogs (List[Dialog]): List of conversational dialogs, where each dialog is a list of messages.
            temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
            top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
            max_gen_len (Optional[int], optional): Maximum length of the generated response sequence.
                If not provided, it's set to the model's maximum sequence length minus 1.
            logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False.

        Returns:
            List[ChatPrediction]: List of chat predictions, each containing the assistant's generated response.

        Raises:
            AssertionError: If the last message in a dialog is not from the user.
            AssertionError: If the dialog roles are not in the required 'user', 'assistant', and optional 'system' order.

        Note:
            This method generates assistant responses for the provided conversational dialogs.
            It employs nucleus sampling to introduce controlled randomness in text generation.
            If logprobs is True, token log probabilities are computed for each generated token.
        """
        if max_gen_len is None:
            max_gen_len = self.model.params.max_seq_len - 1
        prompt_tokens = []
        unsafe_requests = []
        for dialog in dialogs:
            unsafe_requests.append(
                any([tag in msg["content"] for tag in SPECIAL_TAGS for msg in dialog])
            )
            if dialog[0]["role"] == "system":
                dialog = [
                    {
                        "role": dialog[1]["role"],
                        "content": B_SYS
                        + dialog[0]["content"]
                        + E_SYS
                        + dialog[1]["content"],
                    }
                ] + dialog[2:]
            assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
                [msg["role"] == "assistant" for msg in dialog[1::2]]
            ), (
                "model only supports 'system', 'user' and 'assistant' roles, "
                "starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
            )
            dialog_tokens: List[int] = sum(
                [
                    self.tokenizer.encode(
                        f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ",
                        bos=True,
                        eos=True,
                    )
                    for prompt, answer in zip(
                        dialog[::2],
                        dialog[1::2],
                    )
                ],
                [],
            )
            assert (
                dialog[-1]["role"] == "user"
            ), f"Last message must be from user, got {dialog[-1]['role']}"
            dialog_tokens += self.tokenizer.encode(
                f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}",
                bos=True,
                eos=False,
            )
            prompt_tokens.append(dialog_tokens)

        generation_tokens, generation_logprobs = self.generate(
            prompt_tokens=prompt_tokens,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
            logprobs=logprobs,
        )
        if logprobs:
            return [
                {
                    "generation": {
                        "role": "assistant",
                        "content": self.tokenizer.decode(t)
                        if not unsafe
                        else UNSAFE_ERROR,
                    },
                    "tokens": [self.tokenizer.decode(x) for x in t],
                    "logprobs": logprobs_i,
                }
                for t, logprobs_i, unsafe in zip(
                    generation_tokens, generation_logprobs, unsafe_requests
                )
            ]
        return [
            {
                "generation": {
                    "role": "assistant",
                    "content": self.tokenizer.decode(t) if not unsafe else UNSAFE_ERROR,
                }
            }
            for t, unsafe in zip(generation_tokens, unsafe_requests)
        ]
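
    # Illustrative usage (a sketch; `llama` is assumed to come from Llama.build):
    #   dialog: Dialog = [
    #       {"role": "system", "content": "Answer concisely."},
    #       {"role": "user", "content": "What is the capital of France?"},
    #   ]
    #   results = llama.chat_completion([dialog], max_gen_len=64)
    #   print(results[0]["generation"]["content"])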

    def single_prompt_completion(
        self,
        prompt: str,
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_gen_len: Optional[int] = None,
        echo: bool = False,
    ) -> str:
        """
        Perform text completion for a single prompt using the language generation model.

        Args:
            prompt (str): Prompt for completion.
            temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
            top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
            max_gen_len (Optional[int], optional): Maximum length of the generated completion sequence.
                If not provided, it's set to the model's maximum sequence length minus 1.
            echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.

        Returns:
            str: Single string with the decoded output from the model.

        Note:
            This method generates a text completion for the provided prompt, employing nucleus sampling to introduce controlled randomness.
        """
        if max_gen_len is None:
            max_gen_len = self.model.params.max_seq_len - 1
        prompt_tokens = [
            self.tokenizer.encode(f"{B_INST} {prompt.strip()} {E_INST}", bos=True, eos=False)
        ]
        # generate() returns (tokens, logprobs); decoding the single batch of token
        # lists yields a list with one string, which is returned below.
        generation_tokens = self.generate(
            prompt_tokens=prompt_tokens,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
            logprobs=False,
            echo=echo,
        )
        single_result_list = self.tokenizer.decode(generation_tokens[0])
        return single_result_list[0]
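
    # Illustrative usage (a sketch; `llama` is assumed to come from Llama.build):
    #   answer = llama.single_prompt_completion("Explain what Llama Guard checks for.")
    #   print(answer)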


def sample_top_p(probs, p):
    """
    Perform top-p (nucleus) sampling on a probability distribution.

    Args:
        probs (torch.Tensor): Probability distribution tensor.
        p (float): Probability threshold for top-p sampling.

    Returns:
        torch.Tensor: Sampled token indices.

    Note:
        Top-p sampling selects the smallest set of tokens whose cumulative probability mass
        exceeds the threshold p. The distribution is renormalized based on the selected tokens.
    """
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token
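

if __name__ == "__main__":
    # Minimal sketch: exercise sample_top_p on a toy distribution. This part runs on
    # CPU and needs no checkpoints or tokenizer (assuming the module's imports resolve);
    # the Llama entry points above additionally require model weights, a SentencePiece
    # tokenizer file, and a torchrun launch.
    torch.manual_seed(0)
    toy_probs = torch.tensor([[0.5, 0.3, 0.1, 0.05, 0.05]])
    # With p=0.7 the nucleus keeps only the 0.5 and 0.3 entries before renormalizing.
    picked = sample_top_p(toy_probs, p=0.7)
    print("sampled token index:", picked.item())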