# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import json
from typing import List, Literal, TypedDict
Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    role: Role
    content: str


Dialog = List[Message]

# Llama 2 chat prompt delimiters for instruction and system blocks.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
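
# Concretely, format_tokens below renders each (user, assistant) turn as
#   "[INST] {user} [/INST] {assistant} "
# and folds a leading system message into the first user turn as
#   "<<SYS>>\n{system}\n<</SYS>>\n\n{user}"
# (the braces are placeholders for message contents, not literal text).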

def format_tokens(dialogs: List[Dialog], tokenizer) -> List[List[int]]:
    """Encode each dialog into the flat token list expected by Llama 2 chat models."""
    prompt_tokens = []
    for dialog in dialogs:
        # Fold an optional leading system message into the first user message.
        if dialog[0]["role"] == "system":
            dialog = [
                {
                    "role": dialog[1]["role"],
                    "content": B_SYS
                    + dialog[0]["content"]
                    + E_SYS
                    + dialog[1]["content"],
                }
            ] + dialog[2:]
        assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
            [msg["role"] == "assistant" for msg in dialog[1::2]]
        ), (
            "model only supports 'system', 'user' and 'assistant' roles, "
            "starting with user and alternating (u/a/u/a/u...)"
        )
        # Please verify that your tokenizer supports adding "[INST]" and
        # "[/INST]" to your inputs; here they are added manually.
        dialog_tokens: List[int] = sum(
            [
                tokenizer.encode(
                    f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} ",
                )
                for prompt, answer in zip(dialog[::2], dialog[1::2])
            ],
            [],
        )
        assert (
            dialog[-1]["role"] == "user"
        ), f"Last message must be from user, got {dialog[-1]['role']}"
        # The final user message is left unanswered for the model to complete.
        dialog_tokens += tokenizer.encode(
            f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}",
        )
        prompt_tokens.append(dialog_tokens)
    return prompt_tokens

def read_dialogs_from_file(file_path):
    """Load dialogs (a JSON list of message lists) from file_path."""
    with open(file_path, "r") as file:
        dialogs = json.load(file)
    return dialogs
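
if __name__ == "__main__":
    # Minimal usage sketch, assuming only this module: _EchoTokenizer and the
    # inline dialog below are illustrative stand-ins, not part of the original
    # code. In practice you would pass the Llama 2 SentencePiece tokenizer, or
    # any object exposing an encode(str) -> List[int] method.
    class _EchoTokenizer:
        def encode(self, text: str) -> List[int]:
            # "Encode" text as Unicode code points so the sketch runs without
            # any model assets.
            return [ord(ch) for ch in text]

    # Dialogs would normally come from read_dialogs_from_file(<path to JSON>).
    dialogs: List[Dialog] = [
        [
            {"role": "system", "content": "Answer concisely."},
            {"role": "user", "content": "What is the capital of France?"},
        ]
    ]
    prompt_tokens = format_tokens(dialogs, _EchoTokenizer())
    print(f"Formatted {len(prompt_tokens)} dialog(s); "
          f"first prompt is {len(prompt_tokens[0])} tokens long")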