facat committed
Commit c6f1343
Parent: 0c75eca
Files changed (4)
  1. .gitignore +3 -1
  2. pyproject.toml +14 -0
  3. tasks.py +6 -23
  4. tlem.py +2 -4
.gitignore CHANGED
@@ -1,2 +1,4 @@
 __pycache__
-tlem.ju.py
+*.ju.py
+tests
+
pyproject.toml ADDED
@@ -0,0 +1,14 @@
+[tool.poetry]
+name = "tlem"
+version = "0.1.0"
+description = ""
+authors = ["fecet <xiezej@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.10"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
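With this file in place (and assuming a recent Poetry install), `poetry install` can set up an environment against the `python = "^3.10"` constraint and `poetry build` can produce an sdist/wheel through the declared `poetry-core` backend; no third-party runtime dependencies are declared yet.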
tasks.py CHANGED
@@ -225,14 +225,11 @@ class Metrics:
             return {"error": "predictions and references have different " "length"}
         responses = [general_postprocess(pred) for pred in responses]
         processed_answers = [[general_postprocess(j) for j in i] for i in answers]
-        matched_answers = []
-        for pred, ans, origin_ans in zip(responses, processed_answers, answers):
-            if pred in ans or pred in origin_ans:
-                matched_answers.append(pred)
-            else:
-                matched_answers.append(ans[0])
-
-        return responses, matched_answers
+        scores = []
+        for pred, ans in zip(responses, processed_answers):
+            score = np.mean([1 if a in pred else 0 for a in ans])
+            scores.append(score)
+        return {"em": np.mean(scores)}
 
     def bbh_mcq(responses: list[str], answers: list[str | int]):
         if len(responses) != len(answers):
@@ -624,8 +621,6 @@ Text: [PROMPT]
 Question: [QUESTION]
 Anawer:"""
 
-    categories = ["validation"]
-
     @classmethod
     def prompt_drop(cls, example):
         prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
@@ -633,19 +628,7 @@ Anawer:"""
         )
 
         validated_answers = example["answers_spans"]["spans"]
-        validated_types = example["answers_spans"]["types"]
-        answers = []
-        for answer_item, answer_type in zip(validated_answers, validated_types):
-            # if answer_type == "number":
-            #     answers.append(answer_item)
-            # elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
-            #     d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
-            #     answers.append(' '.join(d).strip())
-            # else:
-            #     for span in answer_item['spans']:
-            #         answers.append(span)
-            answers.append(answer_item)
-        answers = list(set(answers))
+        answers = list(set(validated_answers))
 
         return {cls.input_column: prompt, cls.label_column: answers}
 
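A minimal sketch of what the rewritten scoring loop computes, with invented inputs (the values below are not from the repo): each reference answer contributes 1 if it appears as a substring of the post-processed prediction, the per-example fractions are averaged, and the batch mean is reported under the `em` key.

```python
import numpy as np

# Invented post-processed predictions and reference-answer lists.
responses = ["paris", "42"]
processed_answers = [["paris", "france"], ["41"]]

scores = []
for pred, ans in zip(responses, processed_answers):
    # Fraction of reference answers contained in this prediction: 0.5, then 0.0.
    score = np.mean([1 if a in pred else 0 for a in ans])
    scores.append(score)

print({"em": np.mean(scores)})  # mean of [0.5, 0.0] -> 0.25
```

Unlike the removed code path, which returned a `(responses, matched_answers)` tuple for the caller to compare, the function now aggregates and returns the score itself.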
 
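A small sketch of the simplified span handling in `prompt_drop`, using an invented DROP-style record (the field names come from the diff, the values are made up): the per-type post-processing is gone and duplicate spans are dropped with `set()`, which does not preserve the original order.

```python
# Invented DROP-style example; only the fields referenced by prompt_drop are shown.
example = {
    "passage": "The Raiders opened the scoring with a short touchdown pass ...",
    "question": "Who scored the first touchdown?",
    "answers_spans": {"spans": ["Chaz Schilens", "Chaz Schilens"], "types": ["span", "span"]},
}

validated_answers = example["answers_spans"]["spans"]
answers = list(set(validated_answers))  # deduplicate; "types" is no longer consulted
print(answers)  # ['Chaz Schilens']
```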
tlem.py CHANGED
@@ -49,8 +49,7 @@ class ReasoningMetric(evaluate.Metric):
     def _compute(self, responses, references):
         return_value = getattr(Metrics, self.config_name)(responses, references)
         match return_value:
-            case tuple():
-                extract_responses, extract_references = return_value
+            case extract_responses, extract_references:
                 results = {
                     self.config_name: np.mean(
                         sync_pipe(lambda x, y: x == y)(
@@ -182,8 +181,7 @@ class Suite(EvaluationSuite):
     def singleton(self, task):
         try:
             return self.tasks[self.tasks.index(task)]
-
-        except Exception as e:
+        except ValueError:
             self.tasks.append(task)
             return self.tasks[-1]
 
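A standalone sketch (not the repo's `ReasoningMetric`) of why the `case` change works: `case tuple():` only checks the type and needs a second statement to unpack, whereas a two-element sequence pattern matches and binds in one step. The `dict()` case below is invented purely to show that mapping results, such as the new `{"em": ...}` return, are not captured by the sequence pattern.

```python
def describe(return_value):
    match return_value:
        # Matches any two-element sequence (tuple or list) and unpacks it.
        case extract_responses, extract_references:
            return ("pair", extract_responses, extract_references)
        # Mappings never match a sequence pattern, so dict results land here.
        case dict() as results:
            return ("dict", results)
        case _:
            return ("unhandled", return_value)

print(describe((["a", "b"], ["a", "c"])))  # ('pair', ['a', 'b'], ['a', 'c'])
print(describe({"em": 0.25}))              # ('dict', {'em': 0.25})
```

One behavioral difference to keep in mind: the old `case tuple():` accepted tuples of any length, while the new pattern accepts exactly two elements (and also two-element lists).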
 
 
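And a minimal sketch of the narrowed exception in `Suite.singleton`, with a plain list of strings standing in for the real task objects: `list.index` raises `ValueError` when the item is missing, so catching the bare `Exception` (and binding an unused `e`) was broader than necessary.

```python
tasks: list[str] = []

def singleton(task: str) -> str:
    try:
        # Reuse an already-registered, equal task if there is one.
        return tasks[tasks.index(task)]
    except ValueError:
        # list.index raised ValueError: the task is new, so register it.
        tasks.append(task)
        return tasks[-1]

print(singleton("gsm8k"))  # gsm8k (registered)
print(singleton("gsm8k"))  # gsm8k (reused)
print(len(tasks))          # 1
```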