import json

from tqdm import tqdm
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')

for i in tqdm(range(298)):
    # Load one shard of pre-tokenized Wikipedia segments.
    with open(f'wikipedia_json_64_filtered/wikipedia.segmented.nltk.split.seq64.{i}.json', 'r') as f:
        rows = json.load(f)
    # Decode the stored GPT-2 token ids back into plain text.
    tokens = [row['gpt2_token'] for row in rows]
    texts = tokenizer.batch_decode(tokens)
    # Write one decoded segment per line.
    with open(f'wikipedia/{i}.txt', 'w') as f:
        for txt in texts:
            f.write(txt.strip() + '\n')