# Hugging Face Space status: Runtime error
# Grab a dataset and prove we can save it to disk.
from datasets import load_dataset

raw_datasets = load_dataset("allocine")
raw_datasets.save_to_disk("my-arrow-datasets")

# Load the dataset back from disk to prove the saved copy can be reloaded.
from datasets import load_from_disk

arrow_datasets_reloaded = load_from_disk("my-arrow-datasets")
# Bare expression kept as-is: it only shows something under a front end that
# echoes top-level expressions (e.g. a notebook); in a plain script it is a
# no-op. NOTE(review): confirm how this Space is expected to render output.
arrow_datasets_reloaded

# Save a second copy under a path relative to the parent directory.
#dataset_dict.save_to_disk("../data/wikipedia_rank_nocache")
raw_datasets.save_to_disk("../data/awacke1=allocine")

# Prove the cache: inspect the cache files of the reloaded dataset.
arrow_datasets_reloaded.cache_files
# Prove we can save in CSV: write one file per split, then reload them all.
for split, dataset in raw_datasets.items():
    # index=False keeps the row index out of the CSV (the original passed the
    # falsy, undocumented index=None for the same effect).
    dataset.to_csv(f"my-dataset-{split}.csv", index=False)

# Map each split name to the CSV file written for it by the loop above.
data_files = {
    "train": "my-dataset-train.csv",
    "validation": "my-dataset-validation.csv",
    "test": "my-dataset-test.csv",
}
csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
# Bare expression kept as-is; a no-op unless the front end echoes expressions.
csv_datasets_reloaded
# Prove we can save in JSON: write one .jsonl file per split, then reload.
for split, dataset in raw_datasets.items():
    dataset.to_json(f"my-dataset-{split}.jsonl")

# Map each split name to the .jsonl file written for it by the loop above.
json_data_files = {
    "train": "my-dataset-train.jsonl",
    "validation": "my-dataset-validation.jsonl",
    "test": "my-dataset-test.jsonl",
}
json_datasets_reloaded = load_dataset("json", data_files=json_data_files)
# Bare expression kept as-is; a no-op unless the front end echoes expressions.
json_datasets_reloaded
# Prove we can save in Parquet: write one file per split, then reload.
for split, dataset in raw_datasets.items():
    dataset.to_parquet(f"my-dataset-{split}.parquet")

# Map each split name to the Parquet file written for it by the loop above.
parquet_data_files = {
    "train": "my-dataset-train.parquet",
    "validation": "my-dataset-validation.parquet",
    "test": "my-dataset-test.parquet",
}
parquet_datasets_reloaded = load_dataset("parquet", data_files=parquet_data_files)
# Bare expression kept as-is; a no-op unless the front end echoes expressions.
parquet_datasets_reloaded
# Prove we can save and load a public local dataset on Hugging Face Spaces.
# "awacke1/my-arrow-datasets" is used here as a local directory path by
# save_to_disk / load_from_disk; nothing in this step uploads to the Hub.
raw_datasets.save_to_disk("awacke1/my-arrow-datasets")
arrow_datasets_reloaded = load_from_disk("awacke1/my-arrow-datasets")
thisworked="Yes really worked"
# Bare expressions kept as-is; no-ops unless the front end echoes expressions.
arrow_datasets_reloaded
thisworked
# Disabled: loading the same name through load_dataset instead of from disk.
#awacke1_public_datasets = load_dataset("awacke1/my-arrow-datasets")
#awacke1_public_datasets