Spaces:
Running
Running
FIX: extraction function for C-Eval; logging metrics
#3 by Cookize - opened
tasks.py
CHANGED
@@ -149,14 +149,15 @@ class Task:
|
|
149 |
return
|
150 |
self.outputs = outputs
|
151 |
try:
|
152 |
-
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
155 |
except Exception as e:
|
156 |
-
result = self.metric.compute(
|
157 |
-
responses=outputs, references=self.dataset[self.label_column]
|
158 |
-
)
|
159 |
-
finally:
|
160 |
result = outputs
|
161 |
# if log:
|
162 |
# name = name or pipeline.__name__
|
@@ -188,7 +189,7 @@ class Metrics:
|
|
188 |
mmlu = multichoice
|
189 |
|
190 |
def ceval(responses: list[str], answers: list[str | int]):
|
191 |
-
responses = [
|
192 |
return responses, answers
|
193 |
|
194 |
def winogrande(responses: list[str], answers: list[str | int]):
|
@@ -892,7 +893,7 @@ class CEVAL:
|
|
892 |
prefix = (
|
893 |
f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n"
|
894 |
if chat
|
895 |
-
else "
|
896 |
)
|
897 |
|
898 |
prompt = prefix + f'{example["question"]}'
|
@@ -1043,6 +1044,7 @@ class CEVAL:
|
|
1043 |
suite = defaultdict(list)
|
1044 |
cls.categories = defaultdict(list)
|
1045 |
for task, info in cls.ceval_subject_mapping.items():
|
|
|
1046 |
cls.categories[info[2]].append(task)
|
1047 |
cls.categories["all"] = list(cls.ceval_subject_mapping.keys())
|
1048 |
for k, v in cls.categories.items():
|
|
|
149 |
return
|
150 |
self.outputs = outputs
|
151 |
try:
|
152 |
+
try:
|
153 |
+
result = self.metric._compute(
|
154 |
+
responses=outputs, references=self.dataset[self.label_column]
|
155 |
+
)
|
156 |
+
except Exception as e:
|
157 |
+
result = self.metric.compute(
|
158 |
+
responses=outputs, references=self.dataset[self.label_column]
|
159 |
+
)
|
160 |
except Exception as e:
|
|
|
|
|
|
|
|
|
161 |
result = outputs
|
162 |
# if log:
|
163 |
# name = name or pipeline.__name__
|
|
|
189 |
mmlu = multichoice
|
190 |
|
191 |
def ceval(responses: list[str], answers: list[str | int]):
|
192 |
+
responses = [extract_choice_zh(pred) for pred in responses]
|
193 |
return responses, answers
|
194 |
|
195 |
def winogrande(responses: list[str], answers: list[str | int]):
|
|
|
893 |
prefix = (
|
894 |
f"以下是中国关于{_ch_name}考试的单项选择题,请选出其中的正确答案。\n"
|
895 |
if chat
|
896 |
+
else "问题:"
|
897 |
)
|
898 |
|
899 |
prompt = prefix + f'{example["question"]}'
|
|
|
1044 |
suite = defaultdict(list)
|
1045 |
cls.categories = defaultdict(list)
|
1046 |
for task, info in cls.ceval_subject_mapping.items():
|
1047 |
+
cls.categories[info[0]].append(task)
|
1048 |
cls.categories[info[2]].append(task)
|
1049 |
cls.categories["all"] = list(cls.ceval_subject_mapping.keys())
|
1050 |
for k, v in cls.categories.items():
|
tlem.py
CHANGED
@@ -58,7 +58,7 @@ class ReasoningMetric(evaluate.Metric):
|
|
58 |
)
|
59 |
df["extract_responses"] = extract_responses
|
60 |
df["extract_references"] = extract_references
|
61 |
-
print(df)
|
62 |
results = {
|
63 |
"Accuracy": (df["extract_references"] == df["extract_responses"])
|
64 |
.astype(int)
|
@@ -139,7 +139,7 @@ class Suite(EvaluationSuite):
|
|
139 |
case _ if "test" in name:
|
140 |
suite = suite["Test"]
|
141 |
|
142 |
-
self.suite = suite
|
143 |
|
144 |
def __init__(self, name="tlem"):
|
145 |
super().__init__(name)
|
|
|
58 |
)
|
59 |
df["extract_responses"] = extract_responses
|
60 |
df["extract_references"] = extract_references
|
61 |
+
# print(df)
|
62 |
results = {
|
63 |
"Accuracy": (df["extract_references"] == df["extract_responses"])
|
64 |
.astype(int)
|
|
|
139 |
case _ if "test" in name:
|
140 |
suite = suite["Test"]
|
141 |
|
142 |
+
self.suite = [suite] if isinstance(suite, Task) else suite
|
143 |
|
144 |
def __init__(self, name="tlem"):
|
145 |
super().__init__(name)
|