Added English tasks
Files changed:
- custom_tasks.py +3 -1
- requirements.txt +1 -1
- src/about.py +1 -1
- src/backend/run_eval_suite_lighteval.py +2 -2
- src/custom_tasks/arc_challenge_task.py +24 -0
- src/custom_tasks/commonsense_task.py +31 -0
- src/custom_tasks/heq_task.py +1 -2
- src/custom_tasks/sentiment_task.py +1 -2
- src/custom_tasks/winograd_task.py +1 -2
custom_tasks.py
CHANGED
@@ -9,11 +9,13 @@ Author:
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
+from src.custom_tasks.commonsense_task import *
+from src.custom_tasks.arc_challenge_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, commonsense_qa_task, arc_challenge_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
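Note that in the unchanged tail of this file, print(t["name"] for t in TASKS_TABLE) hands a generator expression to print, so it prints a <generator object ...> repr rather than the task names. A minimal fix, assuming the intent is to list the registered names:

    # Materialize the generator so the names are actually printed
    # (hypothetical fix, not part of this commit).
    print([t["name"] for t in TASKS_TABLE])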
requirements.txt
CHANGED
@@ -14,7 +14,7 @@ tqdm==4.65.0
 transformers
 tokenizers>=0.15.0
 # git+https://github.com/huggingface/lighteval.git#egg=lighteval
-git+https://github.com/shaltielshmid/lighteval.git@
+git+https://github.com/shaltielshmid/lighteval.git@fix-greedy-generate-bugs#egg=lighteval
 accelerate==0.24.1
 sentencepiece
 Levenshtein
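This follows pip's VCS requirement syntax, git+https://<host>/<repo>.git@<ref>#egg=<name>: the @fix-greedy-generate-bugs suffix pins the install to that branch of the fork, and #egg=lighteval tells pip which project name the URL satisfies.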
src/about.py
CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'arc:challenge']
 TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
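With the updated list, the join produces the task string below; note that commonsense_qa, although registered in custom_tasks.py above, is not added here, so it is not scheduled for evaluation:

    tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'arc:challenge']
    TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
    print(TASKS_LIGHTEVAL)
    # custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0,custom|arc:challenge|0|0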
src/backend/run_eval_suite_lighteval.py
CHANGED
@@ -22,6 +22,7 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
 
     args = DefaultNamespace(**{
         "endpoint_model_name": eval_request.model,
+        "model_dtype": eval_request.precision,
         "accelerator": accelerator,
         "vendor": vendor,
         "region": region,
@@ -41,8 +42,7 @@
         "tasks": task_names,
         "dataset_loading_processes": 24,
         "num_fewshot_seeds": 0,
-        "reuse_existing": False
-        "model_info": ModelInfo(eval_request.model, eval_request.revision, eval_request.precision, eval_request.params),
+        "reuse_existing": False
     })
 
     try:
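Assuming the diff is rendered faithfully, the two removed lines were in fact a syntax error: "reuse_existing": False lacked a trailing comma before the "model_info" entry, so the dict literal could not have parsed. The commit drops model_info and instead passes the precision through the new model_dtype key added in the first hunk.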
src/custom_tasks/arc_challenge_task.py
ADDED
@@ -0,0 +1,24 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
+
+arc_challenge_task = LightevalTaskConfig(
+    name="arc:challenge",
+    prompt_function="arc",
+    hf_repo="ai2_arc",
+    hf_subset="ARC-Challenge",
+    evaluation_splits=["test"],
+    generation_size=1,
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    trust_dataset=True,
+    stop_sequence=["\n"],
+)
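Since prompt_function="arc" points at lighteval's built-in formatter rather than a local one, this file defines no prompt of its own. As a rough sketch (paraphrased from lighteval's tasks_prompt_formatting, not part of this commit), the built-in function builds a Doc along these lines:

    # Rough paraphrase (not part of this commit) of lighteval's built-in
    # `arc` prompt function that prompt_function="arc" resolves to.
    from lighteval.tasks.requests import Doc

    def arc(line, task_name: str = None):
        return Doc(
            task_name=task_name,
            query=f"Question: {line['question']}\nAnswer:",
            choices=[f" {c}" for c in line["choices"]["text"]],
            gold_index=line["choices"]["label"].index(line["answerKey"]),
        )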
src/custom_tasks/commonsense_task.py
ADDED
@@ -0,0 +1,31 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
+
+def commonsense_qa_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["question"],
+        choices=[f" {c}" for c in line["choices"]["text"]],
+        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
+        instruction="",
+    )
+
+commonsense_qa_task = LightevalTaskConfig(
+    name="commonsense_qa",
+    prompt_function="commonsense_qa_prompt",
+    hf_repo="commonsense_qa",
+    hf_subset="default",
+    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
+    trust_dataset=True,
+    stop_sequence=["\n"],
+)
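To make the prompt format concrete, here is a hypothetical record in the commonsense_qa schema pushed through the commonsense_qa_prompt defined above (the sample values are invented for illustration):

    # Hypothetical sample in the commonsense_qa schema (invented values).
    line = {
        "question": "Where would you put a book you are about to read?",
        "choices": {"label": ["A", "B", "C", "D", "E"],
                    "text": ["shelf", "table", "backpack", "library", "desk"]},
        "answerKey": "B",
    }
    doc = commonsense_qa_prompt(line, task_name="commonsense_qa")
    # doc.query      -> "Where would you put a book you are about to read?"
    # doc.choices    -> [" shelf", " table", " backpack", " library", " desk"]
    # doc.gold_index -> 1, i.e. LETTER_INDICES.index("B")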
src/custom_tasks/heq_task.py
CHANGED
@@ -112,5 +112,4 @@ heq_task = LightevalTaskConfig(
     metric=['heq_tlnls_metric'],
     stop_sequence=['\n'],
     generation_size=64
-)
-heq_task.stop_sequence = as_list(heq_task.stop_sequence)
+)
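The same post-construction as_list normalization is dropped from sentiment_task.py and winograd_task.py below. lighteval's as_list helper simply wraps a scalar value in a list while leaving lists untouched, so the most plausible reading (an inference from this commit, not something it states) is that the pinned fix-greedy-generate-bugs branch now normalizes stop_sequence itself, making these manual lines redundant.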
src/custom_tasks/sentiment_task.py
CHANGED
@@ -56,5 +56,4 @@ sentiment_task = LightevalTaskConfig(
     metric=['sentiment_acc_metric'],
     stop_sequence=['\n'],
     generation_size=32
-)
-sentiment_task.stop_sequence = as_list(sentiment_task.stop_sequence)
+)
src/custom_tasks/winograd_task.py
CHANGED
@@ -53,5 +53,4 @@ winograd_task = LightevalTaskConfig(
     metric=['winograd_acc_metric'],
     stop_sequence=['\n'],
     generation_size=32
-)
-winograd_task.stop_sequence = as_list(winograd_task.stop_sequence)
+)