facat committed on
Commit
0c75eca
1 Parent(s): 360e3ac
Files changed (2) hide show
  1. tasks.py +10 -25
  2. tlem.py +6 -8
tasks.py CHANGED
@@ -14,6 +14,7 @@ from .utils import *
14
  from evaluate import load
15
  from collections import defaultdict
16
  import sys
 
17
 
18
 
19
  # if sys.version_info >= (3, 9):
@@ -73,7 +74,7 @@ class Task:
73
  if isinstance(self.dataset_name, str)
74
  else list(self.dataset_name)
75
  )
76
- names[0] = names[0].split("/")[-1]
77
 
78
  self.name = "-".join(names) + f"-{self.split}"
79
  if isinstance(self.prompt, str):
@@ -652,31 +653,15 @@ Anawer:"""
652
  def suite(
653
  cls,
654
  ):
655
- finer_categories = (
656
- pd.Series(cls.categories) # noqa # type: ignore
657
- .explode()
658
- .reset_index()
659
- .set_index(0)
660
- .groupby(0)
661
- .agg(list)["index"]
662
- .to_dict()
663
  )
664
- suite = defaultdict(list)
665
- categories = list(finer_categories.keys())
666
- for cate in categories:
667
- suite[cate].append(
668
- Task(
669
- ("drop", cate),
670
- metric_name=("sustech/tlem", "drop"),
671
- input_column=cls.input_column,
672
- label_column=cls.label_column,
673
- prompt=partial(cls.prompt_drop),
674
- few_shot=0,
675
- split="validation",
676
- )
677
- )
678
-
679
- return suite
680
 
681
 
682
  class HellaSwag:
 
14
  from evaluate import load
15
  from collections import defaultdict
16
  import sys
17
+ from pathlib import Path
18
 
19
 
20
  # if sys.version_info >= (3, 9):
 
74
  if isinstance(self.dataset_name, str)
75
  else list(self.dataset_name)
76
  )
77
+ names[0] = Path(names[0]).name
78
 
79
  self.name = "-".join(names) + f"-{self.split}"
80
  if isinstance(self.prompt, str):
 
653
  def suite(
654
  cls,
655
  ):
656
+ return Task(
657
+ "drop",
658
+ metric_name=("sustech/tlem", "drop"),
659
+ input_column=cls.input_column,
660
+ label_column=cls.label_column,
661
+ prompt=partial(cls.prompt_drop),
662
+ few_shot=0,
663
+ split="validation",
664
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
 
666
 
667
  class HellaSwag:
tlem.py CHANGED
@@ -81,15 +81,12 @@ class Suite(EvaluationSuite):
81
  case slice() | int():
82
  return self.tasks[key]
83
 
84
- def aggregate(self, suite):
85
  for cate, tasks in suite.items():
86
  if isinstance(tasks, dict):
87
- suite[cate] = self.aggregate(tasks)
88
  else:
89
- result = []
90
- for task in tasks:
91
- result.extend(task.result.values())
92
- suite[cate] = np.mean(result)
93
 
94
  return suite
95
 
@@ -103,7 +100,8 @@ class Suite(EvaluationSuite):
103
  for task in (bar := tqdm(self.tasks)):
104
  bar.desc = f"complete {task.name}."
105
  _ = task.run(model_or_pipeline)
106
- return self.aggregate(deepcopy(self.suite))
 
107
 
108
  def arun(self, model_or_pipeline):
109
  async def sync_function():
@@ -113,7 +111,7 @@ class Suite(EvaluationSuite):
113
 
114
  asyncio.run(sync_function())
115
 
116
- return self.aggregate(deepcopy(self.suite))
117
 
118
  def get_suite(self, name) -> dict[str, Task]:
119
  chat = False
 
81
  case slice() | int():
82
  return self.tasks[key]
83
 
84
+ def agg(self, suite):
85
  for cate, tasks in suite.items():
86
  if isinstance(tasks, dict):
87
+ suite[cate] = self.agg(tasks)
88
  else:
89
+ suite[cate] = np.mean([pd.Series(task.result).mean() for task in tasks])
 
 
 
90
 
91
  return suite
92
 
 
100
  for task in (bar := tqdm(self.tasks)):
101
  bar.desc = f"complete {task.name}."
102
  _ = task.run(model_or_pipeline)
103
+ logging.info(f"{task.name} {task.result=}")
104
+ return self.agg(deepcopy(self.suite))
105
 
106
  def arun(self, model_or_pipeline):
107
  async def sync_function():
 
111
 
112
  asyncio.run(sync_function())
113
 
114
+ return self.agg(deepcopy(self.suite))
115
 
116
  def get_suite(self, name) -> dict[str, Task]:
117
  chat = False