FIX: extraction func of C-Eval; logging metrics

#3 by Cookize - opened
Files changed (2)
  1. tasks.py +11 -9
  2. tlem.py +2 -2
tasks.py CHANGED

@@ -149,14 +149,15 @@ class Task:
             return
         self.outputs = outputs
         try:
-            result = self.metric._compute(
-                responses=outputs, references=self.dataset[self.label_column]
-            )
+            try:
+                result = self.metric._compute(
+                    responses=outputs, references=self.dataset[self.label_column]
+                )
+            except Exception as e:
+                result = self.metric.compute(
+                    responses=outputs, references=self.dataset[self.label_column]
+                )
         except Exception as e:
-            result = self.metric.compute(
-                responses=outputs, references=self.dataset[self.label_column]
-            )
-        finally:
             result = outputs
         # if log:
         #     name = name or pipeline.__name__
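Note on this hunk: the old control flow had a real bug, since `finally: result = outputs` runs on every path and therefore discarded the metric even when `_compute` succeeded. The rewrite nests the two metric calls so that `_compute` is tried first, `compute` is the fallback, and raw outputs survive only when both raise. A minimal, self-contained sketch of the before/after behavior (the lambdas stand in for the repo's metric calls):

```python
def old_style():
    try:
        result = "metric score"      # computed successfully...
    except Exception:
        result = "fallback score"
    finally:
        result = "raw outputs"       # ...but `finally` always overwrites it
    return result

def new_style(primary, fallback, outputs):
    # Prefer the private `_compute` path, fall back to the public `compute`,
    # and return raw outputs only if both metric calls raise.
    try:
        try:
            return primary()
        except Exception:
            return fallback()
    except Exception:
        return outputs

assert old_style() == "raw outputs"                        # score silently lost
assert new_style(lambda: 0.95, lambda: 0.95, None) == 0.95
assert new_style(lambda: 1 / 0, lambda: 0.95, None) == 0.95
assert new_style(lambda: 1 / 0, lambda: 1 / 0, None) is None
```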
@@ -188,7 +189,7 @@ class Metrics:
     mmlu = multichoice

     def ceval(responses: list[str], answers: list[str | int]):
-        responses = [first_capital_postprocess(pred) for pred in responses]
+        responses = [extract_choice_zh(pred) for pred in responses]
         return responses, answers

     def winogrande(responses: list[str], answers: list[str | int]):
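`first_capital_postprocess` returns the first capital letter found anywhere in the response, so any stray capital ahead of the actual choice wins (e.g. "The answer is B" yields "T"), which is presumably why it misbehaved on C-Eval outputs. The replacement `extract_choice_zh` is not defined in this diff; the following is a hypothetical sketch of what such a Chinese-aware extractor typically does. The pattern list and the `_sketch` suffix are the editor's assumptions, not the repo's code:

```python
import re

# Hypothetical sketch in the spirit of extract_choice_zh (whose body is not
# part of this diff): try answer patterns common in Chinese completions
# before falling back to the first standalone option letter.
def extract_choice_zh_sketch(response: str) -> str:
    patterns = [
        r"答案是?\s*[::]?\s*([ABCD])",   # "答案是C" / "答案: B"
        r"选\s*(?:项|择)?\s*([ABCD])",    # "选C" / "选项 B"
        r"([ABCD])\s*[是为]\s*正确",      # "C是正确"
    ]
    for pat in patterns:
        m = re.search(pat, response)
        if m:
            return m.group(1)
    m = re.search(r"[ABCD]", response)    # last resort: first option letter
    return m.group(0) if m else ""

assert extract_choice_zh_sketch("经分析,答案是C。") == "C"
assert extract_choice_zh_sketch("我会选 B,因为……") == "B"
```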
@@ -892,7 +893,7 @@ class CEVAL:
         prefix = (
             f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n"
             if chat
-            else "问题"
+            else "问题:"
         )

         prompt = prefix + f'{example["question"]}'
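For readers without Chinese: the chat-mode prefix reads "The following are single-answer multiple-choice questions from China's {subject} exam; please select the correct answer." The non-chat branch previously prepended the bare noun "问题" ("Question") and ran it straight into the question text; the fix adds the full-width colon. A trivial illustration (the example dict is made up):

```python
# Illustrative only: the non-chat prompt before and after the one-character fix.
example = {"question": "1 + 1 = ?"}

before = "问题" + f'{example["question"]}'     # 问题1 + 1 = ?   (runs together)
after = "问题:" + f'{example["question"]}'    # 问题:1 + 1 = ?  (delimited)
print(before, after, sep="\n")
```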
@@ -1043,6 +1044,7 @@ class CEVAL:
         suite = defaultdict(list)
         cls.categories = defaultdict(list)
         for task, info in cls.ceval_subject_mapping.items():
+            cls.categories[info[0]].append(task)
             cls.categories[info[2]].append(task)
         cls.categories["all"] = list(cls.ceval_subject_mapping.keys())
         for k, v in cls.categories.items():
tlem.py CHANGED

@@ -58,7 +58,7 @@ class ReasoningMetric(evaluate.Metric):
         )
         df["extract_responses"] = extract_responses
         df["extract_references"] = extract_references
-        print(df)
+        # print(df)
         results = {
             "Accuracy": (df["extract_references"] == df["extract_responses"])
             .astype(int)
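This is the "logging metrics" half of the PR title: the unconditional `print(df)` dumped the full extraction table on every `compute` call and is now commented out. An editor's sketch of an alternative, not part of the PR, that keeps the dump available behind a log level:

```python
import logging

import pandas as pd

logger = logging.getLogger(__name__)

# Instead of a commented-out print(df), route the debug dump through logging
# so it can be re-enabled per run with the log level alone.
df = pd.DataFrame({"extract_responses": ["A"], "extract_references": ["A"]})
logging.basicConfig(level=logging.DEBUG)
logger.debug("extraction table:\n%s", df)
```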
@@ -139,7 +139,7 @@ class Suite(EvaluationSuite):
             case _ if "test" in name:
                 suite = suite["Test"]

-        self.suite = suite
+        self.suite = [suite] if isinstance(suite, Task) else suite

     def __init__(self, name="tlem"):
         super().__init__(name)
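When the requested name matches a single task, the `match` above can leave `suite` as a bare `Task` rather than a list, and downstream iteration over `self.suite` would then break. The added guard normalizes that case. A self-contained sketch (this minimal `Task` is a stand-in for the repo's class):

```python
# Stand-in for the repo's Task class, just to make the guard concrete.
class Task:
    name = "ceval-test"

def normalize(suite):
    # A lone Task selected by name gets wrapped so that downstream code can
    # always iterate over self.suite.
    return [suite] if isinstance(suite, Task) else suite

assert [t.name for t in normalize(Task())] == ["ceval-test"]
assert len(normalize([Task(), Task()])) == 2
```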
 
58
  )
59
  df["extract_responses"] = extract_responses
60
  df["extract_references"] = extract_references
61
+ # print(df)
62
  results = {
63
  "Accuracy": (df["extract_references"] == df["extract_responses"])
64
  .astype(int)
 
139
  case _ if "test" in name:
140
  suite = suite["Test"]
141
 
142
+ self.suite = [suite] if isinstance(suite, Task) else suite
143
 
144
  def __init__(self, name="tlem"):
145
  super().__init__(name)