future-xy committed on
Commit
a89d71b
1 Parent(s): 998f2a6

keep gsm8k and XSum/ROUGE

Browse files
Files changed (2) hide show
  1. src/backend/envs.py +20 -19
  2. src/display/utils.py +19 -18
src/backend/envs.py CHANGED
@@ -17,45 +17,46 @@ class Task:
17
 
18
 
19
  class Tasks(Enum):
20
- task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
21
- task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
22
 
23
- task11 = Task("nq8", "em", "NQ Open 8", 8)
24
- task12 = Task("tqa8", "em", "TriviaQA 8", 8)
25
 
26
  # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
27
- task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
28
- task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
29
- task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
30
 
31
- task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
32
- task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
33
- task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)
34
 
35
  # task8 = Task("xsum", "rougeL", "XSum", 2)
36
  # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)
37
 
38
  task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
39
- task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)
40
 
41
- task10 = Task("memo-trap", "acc", "memo-trap", 0)
42
- task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)
43
 
44
- task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
45
 
46
- task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
47
 
48
  # task15 = Task("fever10", "acc", "FEVER", 16)
49
  # task15_1 = Task("fever11", "acc", "FEVER", 8)
50
 
51
- task16 = Task("squadv2", "exact", "SQuADv2", 4)
52
 
53
- task17 = Task("truefalse_cieacf", "acc", "TrueFalse", 8)
54
 
55
  # task18 = Task("faithdial_hallu", "acc", "FaithDial", 8)
56
- task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)
57
 
58
- task20 = Task("race", "acc", "RACE", 0)
 
59
 
60
 
61
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 
17
 
18
 
19
  class Tasks(Enum):
20
+ # task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
21
+ # task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
22
 
23
+ # task11 = Task("nq8", "em", "NQ Open 8", 8)
24
+ # task12 = Task("tqa8", "em", "TriviaQA 8", 8)
25
 
26
  # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
27
+ # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
28
+ # task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
29
+ # task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
30
 
31
+ # task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
32
+ # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
33
+ # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)
34
 
35
  # task8 = Task("xsum", "rougeL", "XSum", 2)
36
  # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)
37
 
38
  task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
39
+ # task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)
40
 
41
+ # task10 = Task("memo-trap", "acc", "memo-trap", 0)
42
+ # task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)
43
 
44
+ # task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
45
 
46
+ # task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
47
 
48
  # task15 = Task("fever10", "acc", "FEVER", 16)
49
  # task15_1 = Task("fever11", "acc", "FEVER", 8)
50
 
51
+ # task16 = Task("squadv2", "exact", "SQuADv2", 4)
52
 
53
+ # task17 = Task("truefalse_cieacf", "acc", "TrueFalse", 8)
54
 
55
  # task18 = Task("faithdial_hallu", "acc", "FaithDial", 8)
56
+ # task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)
57
 
58
+ # task20 = Task("race", "acc", "RACE", 0)
59
+ task21 = Task("gsm8k", "acc", "GSM8K", 0)
60
 
61
 
62
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
src/display/utils.py CHANGED
@@ -20,32 +20,33 @@ class Tasks(Enum):
20
  # nqopen = Task("nq8", "em", "NQ Open/EM")
21
  # triviaqa = Task("tqa8", "em", "TriviaQA/EM")
22
 
23
- truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthQA MC1/Acc")
24
- truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc")
25
- truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE")
26
 
27
  xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
28
- xsum_f = Task("xsum_v2", "factKB", "XSum/factKB")
29
- xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P")
30
 
31
- cnndm_r = Task("cnndm_v2", "rougeL", "CNN-DM/ROUGE")
32
- cnndm_f = Task("cnndm_v2", "factKB", "CNN-DM/factKB")
33
- cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
34
 
35
- race = Task("race", "acc", "RACE/Acc")
36
- squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")
37
 
38
- memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
39
- ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
40
 
41
- faithdial = Task("faithdial_hallu_v2", "acc", "FaithDial/Acc")
42
 
43
- halueval_qa = Task("halueval_qa", "acc", "HaluQA/Acc")
44
- halueval_summ = Task("halueval_summarization", "acc", "HaluSumm/Acc")
45
- halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
46
 
47
- # XXX include me back at some point
48
- selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
 
49
 
50
 
51
  # These classes are for user facing column names,
 
20
  # nqopen = Task("nq8", "em", "NQ Open/EM")
21
  # triviaqa = Task("tqa8", "em", "TriviaQA/EM")
22
 
23
+ # truthfulqa_mc1 = Task("truthfulqa_mc1", "acc", "TruthQA MC1/Acc")
24
+ # truthfulqa_mc2 = Task("truthfulqa_mc2", "acc", "TruthQA MC2/Acc")
25
+ # truthfulqa_gen = Task("truthfulqa_gen", "rougeL_acc", "TruthQA Gen/ROUGE")
26
 
27
  xsum_r = Task("xsum_v2", "rougeL", "XSum/ROUGE")
28
+ # xsum_f = Task("xsum_v2", "factKB", "XSum/factKB")
29
+ # xsum_b = Task("xsum_v2", "bertscore_precision", "XSum/BERT-P")
30
 
31
+ # cnndm_r = Task("cnndm_v2", "rougeL", "CNN-DM/ROUGE")
32
+ # cnndm_f = Task("cnndm_v2", "factKB", "CNN-DM/factKB")
33
+ # cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
34
 
35
+ # race = Task("race", "acc", "RACE/Acc")
36
+ # squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")
37
 
38
+ # memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
39
+ # ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
40
 
41
+ # faithdial = Task("faithdial_hallu_v2", "acc", "FaithDial/Acc")
42
 
43
+ # halueval_qa = Task("halueval_qa", "acc", "HaluQA/Acc")
44
+ # halueval_summ = Task("halueval_summarization", "acc", "HaluSumm/Acc")
45
+ # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
46
 
47
+ # # XXX include me back at some point
48
+ # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
49
+ gsm8k = Task("gsm8k", "acc", "GSM8K")
50
 
51
 
52
  # These classes are for user facing column names,