# train_tokenizer.py

import os

import fire
import sentencepiece as spm


def main(data_file, save_path, vocab_size=16_000, num_threads=8):
    """Train a SentencePiece BPE tokenizer on data_file and save it under save_path."""
    os.makedirs(save_path, exist_ok=True)
    tokenizer_name = os.path.join(save_path, "tokenizer")
    spm.SentencePieceTrainer.train(
        input=data_file,
        model_prefix=tokenizer_name,  # writes tokenizer.model and tokenizer.vocab
        vocab_size=vocab_size,
        num_threads=num_threads,
        model_type="bpe",
        max_sentence_length=1073741824,  # 1 GiB cap, effectively no per-line length limit
        shuffle_input_sentence=True,  # shuffle the corpus before sampling training sentences
        character_coverage=1.0,  # keep every character seen in the training data
        hard_vocab_limit=False,  # allow a final vocab slightly smaller than vocab_size
    )


if __name__ == "__main__":
    fire.Fire(main)  # expose main's arguments as a command-line interface
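
# Example invocation (hypothetical paths; any plain-text corpus with one
# sentence per line works as input):
#
#   python train_tokenizer.py --data_file=corpus.txt --save_path=out/
#
# Afterwards the trained model can be loaded with
# spm.SentencePieceProcessor(model_file="out/tokenizer.model").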