from datasets import load_dataset
import datasets
import json
import numpy as np
import os

# Minimal dataset loading script: works, but omits most optional builder metadata.


class NewDataset(datasets.GeneratorBasedBuilder):
    """Loader for JSON files of items, each carrying a description and a list of reviews.

    Every (item, review) pair becomes one example with fields
    ``description`` (str), ``text`` (str), and ``rating`` (int32).
    """

    def _info(self):
        """Describe the dataset schema and (placeholder) metadata."""
        review_features = datasets.Features(
            {
                "description": datasets.Value("string"),
                "text": datasets.Value("string"),
                "rating": datasets.Value("int32"),
            }
        )
        return datasets.DatasetInfo(
            description="beep boop",
            features=review_features,
            # No supervised (input, target) key pair is declared for this dataset.
            supervised_keys=None,
            homepage="no",
            citation="no",
        )

    def _split_generators(self, dl_manager):
        """Build a single TRAIN split from the user-supplied ``data_files``."""
        data_files = self.config.data_files
        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"files": data_files["train"]},
        )
        return [train_split]

    def _generate_examples(self, files):
        """Yield ``(key, example)`` tuples from each JSON file in ``files``.

        The key is a running integer — only there for legacy (tfds) reasons,
        its value carries no meaning beyond uniqueness.
        """
        example_id = 0
        for path in files:
            with open(path, encoding="utf-8") as handle:
                records = json.load(handle)
            # One example per review; the parent item's description is
            # repeated onto every review it contains.
            for record in records:
                for review in record["reviews"]:
                    example = {
                        "description": record["description_raw"],
                        "text": review["reviewText"],
                        "rating": review["rating"],
                    }
                    yield example_id, example
                    example_id += 1