Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Running

open-moe-llm-leaderboard / cli /halueval-upload-cli.py

future-xy

formatting code

d6d7ec6 9 months ago

2.33 kB

	#!/usr/bin/env python3

	import random
	import requests

	from datasets import load_dataset, Dataset, DatasetDict


	# Hub repository that receives every configuration pushed by this script.
	path = "pminervini/HaluEval"

	# Ask the datasets-server which configurations the repository already exposes.
	# The timeout turns an unresponsive server into an error instead of an endless hang.
	API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
	response = requests.get(API_URL, timeout=30)
	res_json = response.json()

	# Task names; each one is loaded from the local file data/<name>_data.json.
	gold_splits = {"dialogue", "qa", "summarization", "general"}

	# Configurations reported by the server; empty when the payload has no "splits" key.
	available_splits = {split["config"] for split in res_json["splits"]} if "splits" in res_json else set()

	# Loaded datasets keyed by task name, reused further down in the script.
	name_to_ds = {}

	for name in gold_splits:
	    ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"})
	    name_to_ds[name] = ds
	    # NOTE: the `name not in available_splits` guard is currently disabled, so
	    # every configuration is pushed on each run, whether or not it already exists.
	    ds.push_to_hub(path, config_name=name)


	def list_to_dict(lst: list) -> dict:
	    """Transpose a list of row mappings into a mapping of column lists.

	    Args:
	        lst: Rows to transpose; each element must provide ``.items()``.

	    Returns:
	        A dict mapping every key seen in any row to the list of values found
	        under that key, in row order. Keys appear in first-seen order. A row
	        that lacks a key contributes nothing to that key's list, so the
	        columns only line up when all rows share the same keys.
	    """
	    columns = {}
	    for row in lst:
	        for key, value in row.items():
	            # setdefault creates the column on first sight of the key.
	            columns.setdefault(key, []).append(value)
	    return columns


	# Per-task recipe for one sample: (fields copied verbatim, name of the output field).
	# The output field is read from "right_<field>" or "hallucinated_<field>" of the entry.
	_SAMPLE_LAYOUT = {
	    "qa": (("knowledge", "question"), "answer"),
	    "dialogue": (("knowledge", "dialogue_history"), "response"),
	    "summarization": (("document",), "summary"),
	}


	def _build_sample(name: str, entry: dict, is_hallucinated: bool) -> dict:
	    """Build one labelled sample for task *name* from a source *entry*.

	    Args:
	        name: Task name; must be a key of ``_SAMPLE_LAYOUT``.
	        entry: Source record holding the copied fields plus the
	            ``right_*`` / ``hallucinated_*`` variants of the output field.
	        is_hallucinated: Whether to take the hallucinated variant.

	    Returns:
	        A dict with the copied fields, then the chosen output field, then
	        ``"hallucination"`` set to ``"yes"`` or ``"no"``.

	    Raises:
	        ValueError: If no layout is defined for *name*.
	    """
	    if name not in _SAMPLE_LAYOUT:
	        # Explicit error rather than an assert, which is stripped under -O.
	        raise ValueError(f"no sample layout defined for task {name!r}")
	    copied_fields, target_field = _SAMPLE_LAYOUT[name]
	    variant = "hallucinated" if is_hallucinated else "right"
	    sample = {field: entry[field] for field in copied_fields}
	    sample[target_field] = entry[f"{variant}_{target_field}"]
	    sample["hallucination"] = "yes" if is_hallucinated else "no"
	    return sample


	# Build and push a "<name>_samples" configuration for every task except "general".
	for name in gold_splits - {"general"}:
	    # Seeding inside the loop restarts the generator for every task, so a task's
	    # labels are the same whichever order the set yields the names in.
	    random.seed(42)
	    ds = name_to_ds[name]
	    new_entry_lst = []

	    for entry in ds["data"]:
	        # Exactly one draw per entry, taken before the sample is built.
	        is_hallucinated = random.random() > 0.5
	        new_entry_lst.append(_build_sample(name, entry, is_hallucinated))

	    new_ds_map = list_to_dict(new_entry_lst)
	    new_ds = Dataset.from_dict(new_ds_map)
	    new_dsd = DatasetDict({"data": new_ds})

	    new_dsd.push_to_hub(path, config_name=f"{name}_samples")