Spaces:
Running
Running
clean
Browse files
tasks.py
CHANGED
@@ -14,6 +14,7 @@ from .utils import *
|
|
14 |
from evaluate import load
|
15 |
from collections import defaultdict
|
16 |
import sys
|
|
|
17 |
|
18 |
|
19 |
# if sys.version_info >= (3, 9):
|
@@ -73,7 +74,7 @@ class Task:
|
|
73 |
if isinstance(self.dataset_name, str)
|
74 |
else list(self.dataset_name)
|
75 |
)
|
76 |
-
names[0] = names[0].
|
77 |
|
78 |
self.name = "-".join(names) + f"-{self.split}"
|
79 |
if isinstance(self.prompt, str):
|
@@ -652,31 +653,15 @@ Anawer:"""
|
|
652 |
def suite(
|
653 |
cls,
|
654 |
):
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
.
|
659 |
-
.
|
660 |
-
.
|
661 |
-
|
662 |
-
|
663 |
)
|
664 |
-
suite = defaultdict(list)
|
665 |
-
categories = list(finer_categories.keys())
|
666 |
-
for cate in categories:
|
667 |
-
suite[cate].append(
|
668 |
-
Task(
|
669 |
-
("drop", cate),
|
670 |
-
metric_name=("sustech/tlem", "drop"),
|
671 |
-
input_column=cls.input_column,
|
672 |
-
label_column=cls.label_column,
|
673 |
-
prompt=partial(cls.prompt_drop),
|
674 |
-
few_shot=0,
|
675 |
-
split="validation",
|
676 |
-
)
|
677 |
-
)
|
678 |
-
|
679 |
-
return suite
|
680 |
|
681 |
|
682 |
class HellaSwag:
|
|
|
14 |
from evaluate import load
|
15 |
from collections import defaultdict
|
16 |
import sys
|
17 |
+
from pathlib import Path
|
18 |
|
19 |
|
20 |
# if sys.version_info >= (3, 9):
|
|
|
74 |
if isinstance(self.dataset_name, str)
|
75 |
else list(self.dataset_name)
|
76 |
)
|
77 |
+
names[0] = Path(names[0]).name
|
78 |
|
79 |
self.name = "-".join(names) + f"-{self.split}"
|
80 |
if isinstance(self.prompt, str):
|
|
|
653 |
def suite(
|
654 |
cls,
|
655 |
):
|
656 |
+
return Task(
|
657 |
+
"drop",
|
658 |
+
metric_name=("sustech/tlem", "drop"),
|
659 |
+
input_column=cls.input_column,
|
660 |
+
label_column=cls.label_column,
|
661 |
+
prompt=partial(cls.prompt_drop),
|
662 |
+
few_shot=0,
|
663 |
+
split="validation",
|
664 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
665 |
|
666 |
|
667 |
class HellaSwag:
|
tlem.py
CHANGED
@@ -81,15 +81,12 @@ class Suite(EvaluationSuite):
|
|
81 |
case slice() | int():
|
82 |
return self.tasks[key]
|
83 |
|
84 |
-
def
|
85 |
for cate, tasks in suite.items():
|
86 |
if isinstance(tasks, dict):
|
87 |
-
suite[cate] = self.
|
88 |
else:
|
89 |
-
|
90 |
-
for task in tasks:
|
91 |
-
result.extend(task.result.values())
|
92 |
-
suite[cate] = np.mean(result)
|
93 |
|
94 |
return suite
|
95 |
|
@@ -103,7 +100,8 @@ class Suite(EvaluationSuite):
|
|
103 |
for task in (bar := tqdm(self.tasks)):
|
104 |
bar.desc = f"complete {task.name}."
|
105 |
_ = task.run(model_or_pipeline)
|
106 |
-
|
|
|
107 |
|
108 |
def arun(self, model_or_pipeline):
|
109 |
async def sync_function():
|
@@ -113,7 +111,7 @@ class Suite(EvaluationSuite):
|
|
113 |
|
114 |
asyncio.run(sync_function())
|
115 |
|
116 |
-
return self.
|
117 |
|
118 |
def get_suite(self, name) -> dict[str, Task]:
|
119 |
chat = False
|
|
|
81 |
case slice() | int():
|
82 |
return self.tasks[key]
|
83 |
|
84 |
+
def agg(self, suite):
|
85 |
for cate, tasks in suite.items():
|
86 |
if isinstance(tasks, dict):
|
87 |
+
suite[cate] = self.agg(tasks)
|
88 |
else:
|
89 |
+
suite[cate] = np.mean([pd.Series(task.result).mean() for task in tasks])
|
|
|
|
|
|
|
90 |
|
91 |
return suite
|
92 |
|
|
|
100 |
for task in (bar := tqdm(self.tasks)):
|
101 |
bar.desc = f"complete {task.name}."
|
102 |
_ = task.run(model_or_pipeline)
|
103 |
+
logging.info(f"{task.name} {task.result=}")
|
104 |
+
return self.agg(deepcopy(self.suite))
|
105 |
|
106 |
def arun(self, model_or_pipeline):
|
107 |
async def sync_function():
|
|
|
111 |
|
112 |
asyncio.run(sync_function())
|
113 |
|
114 |
+
return self.agg(deepcopy(self.suite))
|
115 |
|
116 |
def get_suite(self, name) -> dict[str, Task]:
|
117 |
chat = False
|