tokenizer.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. # Copyright (c) Meta Platforms, Inc. and affiliates.
  2. # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
  3. import os
  4. from logging import getLogger
  5. from typing import List
  6. from sentencepiece import SentencePieceProcessor
  7. logger = getLogger()
  8. class Tokenizer:
  9. """tokenizing and encoding/decoding text using SentencePiece."""
  10. def __init__(self, model_path: str):
  11. """
  12. Initializes the Tokenizer with a SentencePiece model.
  13. Args:
  14. model_path (str): The path to the SentencePiece model file.
  15. """
  16. # reload tokenizer
  17. assert os.path.isfile(model_path), model_path
  18. self.sp_model = SentencePieceProcessor(model_file=model_path)
  19. logger.info(f"Reloaded SentencePiece model from {model_path}")
  20. # BOS / EOS token IDs
  21. self.n_words: int = self.sp_model.vocab_size()
  22. self.bos_id: int = self.sp_model.bos_id()
  23. self.eos_id: int = self.sp_model.eos_id()
  24. self.pad_id: int = self.sp_model.pad_id()
  25. logger.info(
  26. f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
  27. )
  28. assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
  29. def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
  30. """
  31. Encodes a string into a list of token IDs.
  32. Args:
  33. s (str): The input string to be encoded.
  34. bos (bool): Whether to prepend the beginning-of-sequence token.
  35. eos (bool): Whether to append the end-of-sequence token.
  36. Returns:
  37. List[int]: A list of token IDs.
  38. """
  39. assert type(s) is str
  40. t = self.sp_model.encode(s)
  41. if bos:
  42. t = [self.bos_id] + t
  43. if eos:
  44. t = t + [self.eos_id]
  45. return t
  46. def decode(self, t: List[int]) -> str:
  47. """
  48. Decodes a list of token IDs into a string.
  49. Args:
  50. t (List[int]): The list of token IDs to be decoded.
  51. Returns:
  52. str: The decoded string.
  53. """
  54. return self.sp_model.decode(t)