import os
from dataclasses import dataclass
from enum import Enum

import torch

from src.envs import CACHE_PATH


@dataclass
class Task:
    benchmark: str    # task name as registered in the evaluation harness
    metric: str       # metric key to report on the leaderboard
    col_name: str     # display name of the leaderboard column
    num_fewshot: int  # number of few-shot examples


class Tasks(Enum):
    # Benchmarks run by the backend; commented-out entries are disabled but kept for reference.

    # task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
    # task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper

    # task11 = Task("nq8", "em", "NQ Open 8", 8)
    # task12 = Task("tqa8", "em", "TriviaQA 8", 8)

    # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    # task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
    # task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)

    # task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
    # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
    # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)

    # task8 = Task("xsum", "rougeL", "XSum", 2)
    # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)

    # task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
    # task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)

    # task10 = Task("memo-trap", "acc", "memo-trap", 0)
    # task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)

    # task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)

    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)

    # task15 = Task("fever10", "acc", "FEVER", 16)
    # task15_1 = Task("fever11", "acc", "FEVER", 8)

    # task16 = Task("squadv2", "exact", "SQuADv2", 4)

    # task17 = Task("truefalse_cieacf", "acc", "TrueFalse", 8)

    # task18 = Task("faithdial_hallu", "acc", "FaithDial", 8)
    # task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)

    # task20 = Task("race", "acc", "RACE", 0)
    task21 = Task("mmlu", "acc", "MMLU", 5)


# Backend working copies of the evaluation queue and results, stored under the local cache.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Run evaluations on GPU when available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
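

# Minimal usage sketch (illustrative only, not the backend's actual entry point):
# iterate over the enabled tasks and print their settings.
if __name__ == "__main__":
    for task in Tasks:
        t = task.value
        print(f"{t.col_name}: benchmark={t.benchmark!r}, metric={t.metric!r}, "
              f"num_fewshot={t.num_fewshot}, device={DEVICE}")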