File size: 574 Bytes
fb238e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import sys
import os
from tqdm import tqdm
sys.path.append('../../')

if __name__ == '__main__':
    # Dump the shuffled 'train' split of the 'wudao_180g' dataset to a plain
    # text file, one record's 'text' field per line.
    #
    # Imported here rather than at module top, after '../../' has been added
    # to sys.path above.
    from data.fs_datasets import load_dataset

    # NOTE(review): num_proc=100 is presumably the number of loader worker
    # processes -- confirm against data.fs_datasets.load_dataset.
    dataset = load_dataset('wudao_180g', num_proc=100)
    print('dataset loaded', flush=True)

    # Fixed seed keeps the shuffled order reproducible between runs.
    shuffle_ds = dataset['train'].shuffle(seed=42, writer_batch_size=1000)
    print('dataset shuffled', flush=True)
    need_size = len(shuffle_ds)

    # The record count is embedded in the output file name.
    out_path = 'shuffle_corpus_{}.txt'.format(need_size)

    # 'with' guarantees the file is flushed and closed even if reading a
    # record or writing fails part-way through the corpus.
    with open(out_path, 'w', encoding='utf-8') as f:
        for i in tqdm(range(need_size)):
            # Write '\n', not os.linesep: a text-mode file already translates
            # '\n' to the platform line separator, so os.linesep would yield
            # '\r\r\n' on Windows.
            f.write(shuffle_ds[i]['text'] + '\n')