Shaltiel committed
Commit adf0b2e
1 Parent(s): fe281bf

Fixed whitespace for prediction

custom_tasks.py CHANGED
@@ -9,13 +9,11 @@ Author:
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
-from src.custom_tasks.commonsense_task import *
-from src.custom_tasks.arc_challenge_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, commonsense_qa_task, arc_challenge_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
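As a quick sanity check on the trimmed table, a minimal sketch (assuming each dict returned by as_dict() carries a "name" key, as the __main__ block implies; the expected names are a guess based on src/about.py):

    # Hypothetical check: list the registered custom task names.
    print([t["name"] for t in TASKS_TABLE])
    # expected something like: ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']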
src/about.py CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'arc:challenge']
-TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'
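For reference, the join above now expands to the following task string (a worked example, not part of the commit):

    tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
    TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
    # -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0'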
src/custom_tasks/arc_challenge_task.py DELETED
@@ -1,24 +0,0 @@
-import re
-import string
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.metrics import Metrics, MetricCategory
-from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
-from aenum import extend_enum
-import numpy as np
-from lighteval.tasks.requests import Doc
-from Levenshtein import distance
-import collections
-from lighteval.utils import as_list
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-arc_challenge_task = LightevalTaskConfig(
-    name="arc:challenge",
-    prompt_function="arc",
-    hf_repo="ai2_arc",
-    hf_subset="ARC-Challenge",
-    evaluation_splits=["test"],
-    generation_size=1,
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    trust_dataset=True,
-    stop_sequence=["\n"],
-)
src/custom_tasks/commonsense_task.py DELETED
@@ -1,31 +0,0 @@
-import re
-import string
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.metrics import Metrics, MetricCategory
-from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
-from aenum import extend_enum
-import numpy as np
-from lighteval.tasks.requests import Doc
-from Levenshtein import distance
-import collections
-from lighteval.utils import as_list
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-def commonsense_qa_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["question"],
-        choices=[f" {c}" for c in line["choices"]["text"]],
-        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
-        instruction="",
-    )
-
-commonsense_qa_task = LightevalTaskConfig(
-    name="commonsense_qa",
-    prompt_function="commonsense_qa_prompt",
-    hf_repo="commonsense_qa",
-    hf_subset="default",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    trust_dataset=True,
-    stop_sequence=["\n"],
-),
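In the deleted prompt function, the gold index comes from the answer letter. A small illustration, assuming LETTER_INDICES is the usual ["A", "B", "C", ...] list from lighteval's prompt-formatting helpers (the sample line is made up):

    LETTER_INDICES = ["A", "B", "C", "D", "E"]  # assumed prefix of lighteval's list
    line = {"answerKey": "C", "choices": {"text": ["red", "green", "blue", "cyan", "pink"]}}
    gold_index = LETTER_INDICES.index(line["answerKey"].strip())  # -> 2, i.e. "blue"
    choices = [f" {c}" for c in line["choices"]["text"]]          # note the leading space per choice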
src/custom_tasks/heq_task.py CHANGED
@@ -73,7 +73,7 @@ def tlnls(a_gold, a_pred):
 def heq_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
     if len(predictions) > 1:
         raise ValueError("Predictions should have one item")
-    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub('<[^>]+>', '', predictions[0]).strip() # remove xml tags
     return max([tlnls(x, pred) for x in golds])
 
 heq_tlnls_metric = CorpusLevelMetric(
@@ -93,8 +93,8 @@ def heq_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=list(range(len(line["response"]))),
         instruction="",
     )
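The effect of the added .strip(): whitespace left over after tag removal no longer lowers the tlnls score against the gold answers. A minimal illustration (the prediction string is hypothetical):

    import re

    raw_pred = " <b>42</b> "                               # hypothetical generation with XML tags and padding
    old_pred = re.sub('<[^>]+>', '', raw_pred)             # ' 42 '  -- stray whitespace kept
    new_pred = re.sub('<[^>]+>', '', raw_pred).strip()     # '42'    -- clean comparison against the golds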
src/custom_tasks/sentiment_task.py CHANGED
@@ -37,8 +37,8 @@ def sentiment_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=0,
         instruction="",
     )
src/custom_tasks/winograd_task.py CHANGED
@@ -34,8 +34,8 @@ def winograd_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=0,
         instruction="",
     )