grammar_dataset_process.ipynb

Copyright (c) Meta Platforms, Inc. and affiliates. This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

Use this notebook to pull in datasets and apply pre-processing. Most grammar datasets unfortunately require preprocessing before being usable in training (for example, JFLEG provides 4 correction targets per input, so we have to re-match them as 1:1 input/target pairings).
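As a minimal sketch of that re-matching idea (the `sample` dict below is a toy example, not taken from the dataset), each reference correction simply becomes its own (input, target) pair:

# toy example: one noisy sentence with several reference corrections
sample = {
    "sentence": "She go to school yesterday .",
    "corrections": ["She went to school yesterday.", "She goes to school every day."],
}

# re-match as 1:1 pairs: one (input, target) row per correction
pairs = [(sample["sentence"], c) for c in sample["corrections"]]
print(pairs)

The `generate_csv` function defined below does exactly this, plus spacing cleanup and CSV writing.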

import csv
from datasets import load_dataset
from pathlib import Path
list_replacements = [
  (" .", "."), 
  (" ,", ","),
  (" '", "'"),
  (" ?", "?"),
  (" !", "!"),
  (" :", "!"),
  (" ;", "!"),
  (" n't", "n't"),
  (" v", "n't"),
  ("2 0 0 6", "2006"),
  ("5 5", "55"),
  ("4 0 0", "400"),
  ("1 7-5 0", "1750"),
  ("2 0 %", "20%"),
  ("5 0", "50"),
  ("1 2", "12"),
  ("1 0", "10"),
  ('" ballast water', '"ballast water')
  ]
def correct_spacing(item):
    """ we iterate through the list of all replacements per each item in dataset"""
    for fix in list_replacements:
        item = item.replace(fix[0], fix[1])
    return item

def generate_csv(csv_path, dataset):
    """ apply spacing corrections and save out matched pairs to csv file as dataset"""
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for case in dataset:
            # clean up spacing in the input sentence (no task prefix is added here)
            input_text = case["sentence"]
            input_text = correct_spacing(input_text)

            for correction in case["corrections"]:
                correction = correct_spacing(correction)
                # a few of the cases contain blank strings
                if input_text and correction:
                    writer.writerow([input_text, correction])

For JFLEG, the validation split will be used as 'train' and the test split as 'validation'.

train_dataset = load_dataset("jfleg", split='validation[:]') 
eval_dataset = load_dataset("jfleg", split='test[:]')
Found cached dataset jfleg (/data/home/mreso/.cache/huggingface/datasets/jfleg/default/1.0.0/ed4ab2367351fe31949f48849ae6732b164f0d5ea6bb5d4357ff4293ac89511b)
Found cached dataset jfleg (/data/home/mreso/.cache/huggingface/datasets/jfleg/default/1.0.0/ed4ab2367351fe31949f48849ae6732b164f0d5ea6bb5d4357ff4293ac89511b)
print(train_dataset)
print(eval_dataset)
Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 755
})
Dataset({
    features: ['sentence', 'corrections'],
    num_rows: 748
})
print(train_dataset['sentence'][22])
print(train_dataset['corrections'][22])
Students can focus on only a few subjects they are intwerested in and they will become an experts in those areas . 
['Students can focus on only a few subjects they are interested in and they will become experts in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become experts in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become an expert in those areas . ', 'Students can focus on only a few subjects they are interested in and they will become an expert in those areas . ']
clean22 = correct_spacing(train_dataset['sentence'][22])
clean22
'Students can focus on only a few subjects they are intwerested in and they will become an experts in those areas. '
jfleg_dir = Path.cwd()/'jfleg_dataset'  # naming the folder just 'jfleg' collides with the HF dataset name and makes load_dataset complain
jfleg_dir.mkdir(parents=True,exist_ok=True)
c4_dir = Path.cwd()/'c4_dataset'
c4_dir.mkdir(parents=True,exist_ok=True)

Process Jfleg data

j_train_file = jfleg_dir/'jtrain.csv'
j_eval_file = jfleg_dir/'jeval.csv'
generate_csv(j_train_file, train_dataset)
generate_csv(j_eval_file, eval_dataset)

Process C4_200M (a large dataset!) - we'll pull 10K examples to start.

c4_dataset = load_dataset("liweili/c4_200m", streaming = True)
iterator = iter(c4_dataset['train'])
def c4_generate_csv(csv_path, iterator, num_examples):
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for _ in range(num_examples):
            data = next(iterator)
            input_text = correct_spacing(data["input"])
            correction = correct_spacing(data["output"])
            if input_text and correction:
                writer.writerow([input_text, correction])

You can modify the following to generate a CSV file with the desired number of instances; here we use 10K for a quick test.

c4_filename = c4_dir/'c4train_10k.csv'
c4_generate_csv(c4_filename, iterator, num_examples=10000)
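A quick sanity check, sketched here with pandas, can confirm the file was written; the row count may come in slightly under 10,000 because pairs containing blank strings are skipped:

import pandas as pd

c4_df = pd.read_csv(c4_filename)
print(len(c4_df))      # expect close to 10000 (blank pairs are dropped)
print(c4_df.head(3))   # inspect a few (input, target) pairs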

Create a single training file by combining jtrain and c4train

import pandas as pd

merge_list = [j_train_file, c4_filename]
combined_csv = pd.concat([pd.read_csv(fn) for fn in merge_list])
merged_name = "gtrain_10k.csv"
combined_csv.to_csv(merged_name, index=False, encoding='utf-8-sig')
eval_name = "grammar_validation.csv"
eval_csv = pd.read_csv(j_eval_file)
eval_csv.to_csv(eval_name, index=False, encoding='utf-8-sig')
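As a final check, the resulting CSVs can be loaded back as a Hugging Face dataset (a minimal sketch using the file names written above):

from datasets import load_dataset

grammar_dataset = load_dataset(
    "csv",
    data_files={"train": "gtrain_10k.csv", "validation": "grammar_validation.csv"},
)
print(grammar_dataset)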