Shaltiel committed
Commit: fe281bf
1 Parent(s): 45e5a75

Added English tasks

custom_tasks.py CHANGED
@@ -9,11 +9,13 @@ Author:
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
+from src.custom_tasks.commonsense_task import *
+from src.custom_tasks.arc_challenge_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, commonsense_qa_task, arc_challenge_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
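Note that the unchanged __main__ block prints a generator expression, so it displays a generator object rather than the task names. A list comprehension (a suggested follow-up, not part of this commit) would print them:

if __name__ == "__main__":
    # Materialize the names first; print(t["name"] for t in TASKS_TABLE)
    # only shows "<generator object ...>".
    print([t["name"] for t in TASKS_TABLE])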
requirements.txt CHANGED
@@ -14,7 +14,7 @@ tqdm==4.65.0
 transformers
 tokenizers>=0.15.0
 # git+https://github.com/huggingface/lighteval.git#egg=lighteval
-git+https://github.com/shaltielshmid/lighteval.git@temp-for-heb-leaderboard#egg=lighteval
+git+https://github.com/shaltielshmid/lighteval.git@fix-greedy-generate-bugs#egg=lighteval
 accelerate==0.24.1
 sentencepiece
 Levenshtein
src/about.py CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'arc:challenge']
 TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
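For reference, the join emits one custom|<task>|0|0 spec per task (the trailing zeros are lighteval's few-shot settings), so with the updated list:

tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'arc:challenge']
TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
# -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0,custom|arc:challenge|0|0'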
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -22,6 +22,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
 
     args = DefaultNamespace(**{
         "endpoint_model_name": eval_request.model,
+        "model_dtype": eval_request.precision,
         "accelerator": accelerator,
         "vendor": vendor,
         "region": region,
@@ -41,8 +42,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         "tasks": task_names,
         "dataset_loading_processes": 24,
         "num_fewshot_seeds": 0,
-        "reuse_existing": False,
-        "model_info": ModelInfo(eval_request.model, eval_request.revision, eval_request.precision, eval_request.params),
+        "reuse_existing": False
     })
 
     try:
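DefaultNamespace is defined elsewhere in this file; a minimal sketch of the likely idea (an assumption for illustration, not the repo's actual class) is a namespace that reads as None for any argument left out of the dict, which would let optional entries like the removed model_info simply be omitted:

from types import SimpleNamespace

class DefaultNamespace(SimpleNamespace):
    # Hypothetical sketch: __getattr__ is only invoked when normal
    # attribute lookup fails, so unset arguments read as None.
    def __getattr__(self, name):
        return None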
src/custom_tasks/arc_challenge_task.py ADDED
@@ -0,0 +1,24 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
+
+arc_challenge_task = LightevalTaskConfig(
+    name="arc:challenge",
+    prompt_function="arc",
+    hf_repo="ai2_arc",
+    hf_subset="ARC-Challenge",
+    evaluation_splits=["test"],
+    generation_size=1,
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    trust_dataset=True,
+    stop_sequence=["\n"],
+)
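Here prompt_function="arc" points at a formatter shipped with lighteval rather than one defined locally, so this file only declares config. A quick sanity check (assuming the pinned lighteval branch, whose LightevalTaskConfig.as_dict() custom_tasks.py already relies on):

if __name__ == "__main__":
    # as_dict() is the form custom_tasks.py collects into TASKS_TABLE.
    print(arc_challenge_task.as_dict()["name"])  # expected: arc:challenge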
src/custom_tasks/commonsense_task.py ADDED
@@ -0,0 +1,31 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
+
+def commonsense_qa_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["question"],
+        choices=[f" {c}" for c in line["choices"]["text"]],
+        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
+        instruction="",
+    )
+
+commonsense_qa_task = LightevalTaskConfig(
+    name="commonsense_qa",
+    prompt_function="commonsense_qa_prompt",
+    hf_repo="commonsense_qa",
+    hf_subset="default",
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    trust_dataset=True,
+    stop_sequence=["\n"],
+)
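Note the file as committed ends the LightevalTaskConfig call with a trailing comma, which would make commonsense_qa_task a one-element tuple and break the task.as_dict() call in custom_tasks.py; the reconstruction above drops it. To see what the prompt function produces, here is a hand-written sample in the commonsense_qa schema (field names taken from the code above, values invented):

sample = {
    "question": "Where would you put a plate after washing it?",
    "choices": {"label": ["A", "B", "C"], "text": ["cupboard", "sink", "floor"]},
    "answerKey": "A",
}
doc = commonsense_qa_prompt(sample, task_name="commonsense_qa")
# doc.choices    == [" cupboard", " sink", " floor"]  (note the leading spaces)
# doc.gold_index == 0, i.e. LETTER_INDICES.index("A")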
src/custom_tasks/heq_task.py CHANGED
@@ -112,5 +112,4 @@ heq_task = LightevalTaskConfig(
     metric=['heq_tlnls_metric'],
     stop_sequence=['\n'],
     generation_size=64
 )
-heq_task.stop_sequence = as_list(heq_task.stop_sequence)
src/custom_tasks/sentiment_task.py CHANGED
@@ -56,5 +56,4 @@ sentiment_task = LightevalTaskConfig(
     metric=['sentiment_acc_metric'],
     stop_sequence=['\n'],
     generation_size=32
 )
-sentiment_task.stop_sequence = as_list(sentiment_task.stop_sequence)
src/custom_tasks/winograd_task.py CHANGED
@@ -53,5 +53,4 @@ winograd_task = LightevalTaskConfig(
     metric=['winograd_acc_metric'],
     stop_sequence=['\n'],
     generation_size=32
 )
-winograd_task.stop_sequence = as_list(winograd_task.stop_sequence)
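The three removals above drop the same post-hoc coercion. A plausible reading (the commit message does not say) is that the pinned lighteval branch now normalizes stop_sequence itself; the call was a no-op for these configs in any case, since each already passes a list:

from lighteval.utils import as_list

# as_list wraps scalars in a list but returns an existing list unchanged,
# so applying it to stop_sequence=['\n'] never altered anything.
assert as_list(['\n']) == ['\n']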