1234567891011121314151617181920212223242526272829303132 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
# For dataset details visit: https://huggingface.co/datasets/samsum
- import datasets
- from .utils import Concatenator
def get_preprocessed_samsum(dataset_config, tokenizer, split):
    """Load a split of the "samsum" dataset and turn it into tokenized prompts.

    Each sample's ``dialogue`` and ``summary`` fields are rendered into a
    single summarization prompt terminated by the tokenizer's EOS token,
    the resulting text is tokenized in batches, and the tokenized batches
    are passed through ``Concatenator``.

    Args:
        dataset_config: Not read by this function; kept in the signature so
            existing callers keep working.
        tokenizer: Callable applied to a batch of prompt strings; must also
            expose an ``eos_token`` attribute.
        split: Split name forwarded to ``datasets.load_dataset``.

    Returns:
        The dataset produced by the final ``map(Concatenator(), batched=True)``
        call. NOTE(review): the exact output columns depend on the tokenizer
        and on ``Concatenator``, neither of which is visible here.
    """
    # The doubled braces inside the f-string collapse to single braces, so
    # the resulting template still contains the `{dialog}`, `{summary}` and
    # `{eos_token}` placeholders that `str.format` fills in below.
    template = (
        f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n{{summary}}{{eos_token}}"
    )

    def render_prompt(example):
        # Build the full training text for one dialogue/summary pair.
        text = template.format(
            dialog=example["dialogue"],
            summary=example["summary"],
            eos_token=tokenizer.eos_token,
        )
        return {"text": text}

    def tokenize_batch(batch):
        # Called with batched=True, so batch["text"] is a list of prompts.
        return tokenizer(batch["text"])

    raw = datasets.load_dataset("samsum", split=split)

    # Drop every original column so that only the rendered "text" remains.
    prompts = raw.map(render_prompt, remove_columns=list(raw.features))

    # Replace the text column with whatever the tokenizer returns.
    tokenized = prompts.map(
        tokenize_batch,
        batched=True,
        remove_columns=list(prompts.features),
    )

    return tokenized.map(Concatenator(), batched=True)
|