# gpt-fi/data/fine-tuning/online_reviews_loading.py
# Author: Vaino Hatanpaa — "add training and evaluation scripts" (commit ceedef8)
from datasets import load_dataset
import datasets
import json
import numpy as np
import os
#Dataset loading script that is missing quite a lot of details but works
class NewDataset(datasets.GeneratorBasedBuilder):
    """Load online-review JSON files into (description, text, rating) examples.

    Each input file is expected to be a JSON array of items, where every item
    has a raw product/venue description (`description_raw`) and a list of
    reviews; one example is emitted per review.
    NOTE(review): metadata strings below are placeholders, per the original
    author's own comment — fill in real description/homepage/citation later.
    """

    def _info(self):
        """Return dataset metadata.

        Features: two strings (`description`, `text`) and an int32 `rating`.
        """
        return datasets.DatasetInfo(
            description="beep boop",
            features=datasets.Features(
                {
                    "description": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "rating": datasets.Value("int32"),
                }
            ),
            # No default supervised_keys (as we have to pass both question
            # and context as input).
            supervised_keys=None,
            homepage="no",
            citation="no",
        )

    def _split_generators(self, dl_manager):
        """Expose a single TRAIN split fed by the user-supplied `train` data files.

        Raises:
            KeyError: if the builder was configured without a "train" entry
                in `data_files`.
        """
        files = self.config.data_files
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": files["train"]},
            )
        ]

    def _generate_examples(self, files):
        """Yield (key, example) tuples from the given JSON files.

        The `key` is a running counter kept only for legacy (tfds) reasons.

        Args:
            files: iterable of JSON file paths, as passed via `gen_kwargs`
                from `_split_generators`. A lone path string is also accepted.
        """
        if isinstance(files, str):
            # Robustness: `data_files` may resolve to a single path rather
            # than a list; iterating a bare string would open per-character
            # "paths" and fail confusingly.
            files = [files]
        key = 0
        for file in files:
            with open(file, encoding="utf-8") as f:
                data = json.load(f)
            for item in data:
                for review in item["reviews"]:
                    yield key, {
                        "description": item["description_raw"],
                        "text": review["reviewText"],
                        "rating": review["rating"],
                    }
                    key += 1