import json

import datasets


# Dataset loading script that skips most of the optional metadata but works.
class NewDataset(datasets.GeneratorBasedBuilder):
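    # A GeneratorBasedBuilder needs three methods: _info (the schema),
    # _split_generators (which files feed which split), and _generate_examples
    # (the actual row generator).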
    def _info(self):
        return datasets.DatasetInfo(
            description="beep boop",
            # Examples yielded by _generate_examples are encoded against these
            # features, so the declared types have to match the data.
            features=datasets.Features(
                {
                    "description": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "rating": datasets.Value("int32"),
                }
            ),
            # No default supervised_keys: the input is the (description, text)
            # pair rather than a single column.
            supervised_keys=None,
            homepage="no",
            citation="no",
        )

    def _split_generators(self, dl_manager):
        # data_files is whatever was passed to load_dataset(..., data_files=...).
        files = self.config.data_files
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": files["train"]},
            )
        ]
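
    # Only a train split is produced above; a validation or test split would be
    # one more SplitGenerator in the returned list, along the lines of
    # (assuming a "validation" key in data_files):
    #
    #   datasets.SplitGenerator(
    #       name=datasets.Split.VALIDATION,
    #       gen_kwargs={"files": files["validation"]},
    #   )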

    def _generate_examples(self, files):
        """Yields (key, example) tuples; `files` comes from the gen_kwargs set
        in _split_generators. The key only has to be unique per example; it is
        a legacy of tfds and not otherwise meaningful."""
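        # The input is expected to be a JSON list shaped roughly like this
        # (inferred from the field accesses below; real files may carry more keys):
        #
        # [
        #   {
        #     "description_raw": "Product description ...",
        #     "reviews": [
        #       {"reviewText": "Great!", "rating": 5}
        #     ]
        #   }
        # ]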
        key = 0
        for file in files:
            with open(file, encoding="utf-8") as f:
                data = json.load(f)
            # One example per review; the parent item's description is
            # repeated for each of its reviews.
            for item in data:
                for review in item["reviews"]:
                    yield key, {
                        "description": item["description_raw"],
                        "text": review["reviewText"],
                        "rating": review["rating"],
                    }
                    key += 1
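

# A minimal usage sketch, assuming this script is saved as "new_dataset.py" and
# that "reviews.json" (a hypothetical path) matches the structure sketched in
# _generate_examples:
if __name__ == "__main__":
    dataset = datasets.load_dataset(
        "new_dataset.py", data_files={"train": "reviews.json"}
    )
    print(dataset["train"][0])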