update
Browse files
- .gitignore +3 -1
- pyproject.toml +14 -0
- tasks.py +6 -23
- tlem.py +2 -4
.gitignore
CHANGED
@@ -1,2 +1,4 @@
 __pycache__
-
+*.ju.py
+tests
+
pyproject.toml
ADDED
@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "tlem"
+version = "0.1.0"
+description = ""
+authors = ["fecet <xiezej@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
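The caret constraint python = "^3.10" pins the interpreter to >=3.10,<4.0, which matters here because the tlem.py change below relies on structural pattern matching, a Python 3.10 feature. A minimal sketch of how that constraint reads, using the third-party packaging library purely for illustration (it is not a dependency declared above):

from packaging.specifiers import SpecifierSet

# Poetry's caret notation "^3.10" expands to the range below.
# `packaging` is used only for this illustration; it is not declared in pyproject.toml.
python_range = SpecifierSet(">=3.10,<4.0")

for version in ("3.9", "3.10", "3.12"):
    # contains() checks a version string against the specifier set
    print(version, python_range.contains(version))
# 3.9 -> False, 3.10 -> True, 3.12 -> True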
tasks.py
CHANGED
@@ -225,14 +225,11 @@ class Metrics:
         return {"error": "predictions and references have different " "length"}
         responses = [general_postprocess(pred) for pred in responses]
         processed_answers = [[general_postprocess(j) for j in i] for i in answers]
-
-        for pred, ans
-        if
-
-
-            matched_answers.append(ans[0])
-
-        return responses, matched_answers
+        scores = []
+        for pred, ans in zip(responses, processed_answers):
+            score = np.mean([1 if a in pred else 0 for a in ans])
+            scores.append(score)
+        return {"em": np.mean(scores)}
 
     def bbh_mcq(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
@@ -624,8 +621,6 @@ Text: [PROMPT]
 Question: [QUESTION]
 Anawer:"""
 
-    categories = ["validation"]
-
     @classmethod
     def prompt_drop(cls, example):
         prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
@@ -633,19 +628,7 @@ Anawer:"""
         )
 
         validated_answers = example["answers_spans"]["spans"]
-
-        answers = []
-        for answer_item, answer_type in zip(validated_answers, validated_types):
-            # if answer_type == "number":
-            #     answers.append(answer_item)
-            # elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
-            #     d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
-            #     answers.append(' '.join(d).strip())
-            # else:
-            #     for span in answer_item['spans']:
-            #         answers.append(span)
-            answers.append(answer_item)
-        answers = list(set(answers))
+        answers = list(set(validated_answers))
 
         return {cls.input_column: prompt, cls.label_column: answers}
 
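The rewritten block in the first hunk replaces the old matched-answers pass with a direct score: for each example it takes the fraction of reference answers that occur as substrings of the post-processed prediction, then averages across examples and reports the result under "em". A self-contained sketch of that logic, with a trivial lower/strip stand-in for general_postprocess (the real helper is defined elsewhere in tasks.py):

import numpy as np

def general_postprocess_stub(text: str) -> str:
    # stand-in for tasks.py's general_postprocess, just for this sketch
    return text.lower().strip()

def em_like_score(responses: list[str], answers: list[list[str]]) -> dict:
    responses = [general_postprocess_stub(pred) for pred in responses]
    processed_answers = [[general_postprocess_stub(a) for a in group] for group in answers]
    scores = []
    for pred, ans in zip(responses, processed_answers):
        # fraction of reference answers contained in the prediction
        scores.append(np.mean([1 if a in pred else 0 for a in ans]))
    return {"em": np.mean(scores)}

print(em_like_score(["Paris is the capital of France"], [["Paris", "Lyon"]]))
# em comes out to 0.5 here: "paris" is found, "lyon" is not

Note that despite the "em" key this is a containment average rather than strict exact match, so an example earns partial credit when only some of its reference answers appear in the prediction.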
tlem.py
CHANGED
@@ -49,8 +49,7 @@ class ReasoningMetric(evaluate.Metric):
     def _compute(self, responses, references):
         return_value = getattr(Metrics, self.config_name)(responses, references)
         match return_value:
-            case
-                extract_responses, extract_references = return_value
+            case extract_responses, extract_references:
                 results = {
                     self.config_name: np.mean(
                         sync_pipe(lambda x, y: x == y)(
@@ -182,8 +181,7 @@ class Suite(EvaluationSuite):
     def singleton(self, task):
         try:
             return self.tasks[self.tasks.index(task)]
-
-        except Exception as e:
+        except ValueError:
             self.tasks.append(task)
             return self.tasks[-1]
 
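Both tlem.py hunks lean on specific Python behaviour: `case extract_responses, extract_references:` is a sequence pattern that destructures any two-element return value, and list.index raises ValueError (not a broader Exception) when the task is missing, which is what the narrowed except clause now catches. A minimal standalone sketch of both idioms, using simplified stand-ins for ReasoningMetric._compute and Suite.singleton rather than the real classes:

def compute(return_value):
    # Sequence pattern: matches any 2-element tuple/list and binds both halves,
    # mirroring the new `case extract_responses, extract_references:` branch.
    match return_value:
        case extract_responses, extract_references:
            pairs = zip(extract_responses, extract_references)
            return sum(r == e for r, e in pairs) / len(extract_references)
        case dict() as already_scored:  # hypothetical fall-through, not shown in the diff
            return already_scored

def singleton(tasks: list, task):
    # list.index raises ValueError when `task` is absent, so that is the exception
    # worth catching; a bare `except Exception` would also hide unrelated bugs.
    try:
        return tasks[tasks.index(task)]
    except ValueError:
        tasks.append(task)
        return tasks[-1]

print(compute((["a", "b"], ["a", "c"])))  # 0.5
print(singleton(["drop"], "gsm8k"))       # 'gsm8k' is appended and returned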