# pile-v2-eda / load_dataset.py
# Reshinth Adithyan -- "Remove single datapoint datasets" (commit 9c88e2b), 533 Bytes
# (The two lines above are repository-page metadata, not part of the script.)
# import datasets
# import logging
import os
import json
# from tqdm import tqdm
# Disabled caching step, kept for reference (PATH is not defined anywhere in
# the visible file, so it must be supplied before re-enabling this):
# dataset_subs = os.listdir(PATH)
# print(dataset_subs)
# for ds in tqdm(dataset_subs):
#     try:
#         print(ds)
#         dataset = datasets.load_dataset("CarperAI/pile-v2-small-filtered",data_files=f"data/{ds}/data.json", split="train")
#         dataset.save_to_disk(f"cache_ds/{ds}")
#     except:
#         print(f"Error at {ds}")
# Write the names of the entries found in "cache_ds" to documentation.json as
# a JSON array. Presumably each entry is one cached dataset subset, as written
# by the commented-out caching loop above -- confirm before relying on it.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the generated file stable across runs and platforms.
ds_subsets = sorted(os.listdir("cache_ds"))
with open("documentation.json", "w", encoding="utf-8") as f:
    json.dump(ds_subsets, f)