"""Decode GPT-2 token IDs from the filtered Wikipedia JSON shards back to plain text."""
import json
import os

from tqdm import tqdm
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Make sure the output directory exists before writing.
os.makedirs('wikipedia', exist_ok=True)

# Process each of the 298 pre-tokenized shards.
for i in tqdm(range(298)):

    with open(f'wikipedia_json_64_filtered/wikipedia.segmented.nltk.split.seq64.{i}.json', 'r') as f:
        rows = json.load(f)

    # Each row holds a list of GPT-2 token IDs; decode all rows in one batch.
    tokens = [row['gpt2_token'] for row in rows]
    texts = tokenizer.batch_decode(tokens)

    # Write one decoded sequence per line.
    with open(f'wikipedia/{i}.txt', 'w') as f:
        for txt in texts:
            f.write(txt.strip() + '\n')