Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

leaderboard / src / backend / envs.py

update

73d1e6e about 1 year ago

1.52 kB

	import os

	import torch

	from dataclasses import dataclass
	from enum import Enum

	from src.envs import CACHE_PATH


	@dataclass
	class Task:
	    """Describe a single evaluation task and how it is reported.

	    Instances are used as the values of the ``Tasks`` enum members.
	    """

	    # Task key in the json file.
	    benchmark: str
	    # Metric key in the json file.
	    metric: str
	    # Name to display in the leaderboard.
	    col_name: str
	    # Number of few-shot examples (0 for zero-shot tasks).
	    num_fewshot: int


	class Tasks(Enum):
	    """Enabled evaluation tasks; each member's value is a ``Task``.

	    Commented-out members are disabled tasks, which is why the ``taskN``
	    numbering has gaps.
	    """

	    # Task arguments, in order: task_key in the json file, metric_key in the
	    # json file, name to display in the leaderboard, number of few-shot examples.
	    # (The two examples below predate the num_fewshot argument.)
	    # task0 = Task("anli_r1", "acc", "ANLI")
	    # task1 = Task("logiqa", "acc_norm", "LogiQA")
	    task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
	    task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper

	    # TruthfulQA is intended as a zero-shot benchmark [5, 47].
	    # https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
	    # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
	    task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
	    task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)

	    task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
	    # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
	    # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)

	    # task6 = Task("xsum", "rougeL_acc", "XSum", 8)

	    task8 = Task("memo-trap", "acc", "memo-trap", 0)

	# NUM_FEWSHOT = 64 # Change with your few shot


	# Backend directories located under CACHE_PATH (the names suggest the
	# evaluation request queue and the evaluation results; confirm with callers).
	EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
	EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

	# Select "cuda" when torch reports CUDA as available, otherwise "cpu".
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	# Testing; needs to be None
	LIMIT = None