File size: 1,517 Bytes
894c4b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c79b12
894c4b4
 
 
 
 
 
90dff75
 
bcdca08
73d1e6e
bcdca08
 
 
73d1e6e
bcdca08
73d1e6e
 
 
bcdca08
73d1e6e
 
894c4b4
6c79b12
894c4b4
b1a5839
894c4b4
 
 
7e68bad
894c4b4
f9d415e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os

import torch

from dataclasses import dataclass
from enum import Enum

from src.envs import CACHE_PATH


@dataclass
class Task:
    """Description of a single evaluation task.

    Instances of this class are used as the values of the ``Tasks`` enum.
    """

    # Task key in the json file.
    benchmark: str
    # Metric key in the json file.
    metric: str
    # Name to display in the leaderboard.
    col_name: str
    # Few-shot count for the task (0 is used for zero-shot benchmarks).
    # NOTE(review): how this value is consumed is not visible here — confirm
    # against the code that reads it.
    num_fewshot: int


class Tasks(Enum):
    """Evaluation tasks that are currently enabled.

    Every member's value is a ``Task`` carrying, in order: the task key in
    the json file, the metric key in the json file, the name to display in
    the leaderboard, and the few-shot count.

    Member numbering has gaps because some entries are disabled; they are
    kept below as comments.
    """

    # Disabled: task0 = Task("anli_r1", "acc", "ANLI")
    # Disabled: task1 = Task("logiqa", "acc_norm", "LogiQA")

    # 64 shots, as in the ATLAS paper.
    task0 = Task(
        benchmark="nq_open",
        metric="em",
        col_name="NQ Open",
        num_fewshot=64,
    )
    # 64 shots, as in the ATLAS paper.
    task1 = Task(
        benchmark="triviaqa",
        metric="em",
        col_name="TriviaQA",
        num_fewshot=64,
    )

    # TruthfulQA is intended as a zero-shot benchmark:
    # https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    # Disabled: task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    task3 = Task(
        benchmark="truthfulqa_mc1",
        metric="acc",
        col_name="TruthfulQA MC1",
        num_fewshot=0,
    )
    task4 = Task(
        benchmark="truthfulqa_mc2",
        metric="acc",
        col_name="TruthfulQA MC2",
        num_fewshot=0,
    )

    task5 = Task(
        benchmark="halueval_qa",
        metric="acc",
        col_name="HaluEval QA",
        num_fewshot=0,
    )
    # Disabled: task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
    # Disabled: task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)

    # Disabled: task6 = Task("xsum", "rougeL_acc", "XSum", 8)

    task8 = Task(
        benchmark="memo-trap",
        metric="acc",
        col_name="memo-trap",
        num_fewshot=0,
    )

# NUM_FEWSHOT = 64  # Change with your few shot


# Backend directories for the evaluation queue and the evaluation results,
# both located under CACHE_PATH.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Testing aid; needs to be None.
# NOTE(review): presumably only set to another value while testing — confirm
# against the code that reads LIMIT.
LIMIT = None