Added winograd tasks
- custom_tasks.py +2 -1
- src/about.py +2 -1
- src/custom_tasks/winograd_task.py +57 -0
custom_tasks.py
CHANGED

@@ -8,11 +8,12 @@ Author:
 """
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
+from src.custom_tasks.winograd_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
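One thing worth flagging in the unchanged __main__ block: print(t["name"] for t in TASKS_TABLE) passes a generator expression straight to print(), so it prints the generator object rather than the task names. A minimal sketch of the likely intent, assuming each as_dict() result carries a "name" key:

# Materialize the generator so the names themselves are printed,
# e.g. ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
print([t["name"] for t in TASKS_TABLE])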
src/about.py
CHANGED

@@ -21,4 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
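For reference, with the three task names above, the join produces one spec per task in lighteval's suite|task|fewshot|truncate format:

# What TASKS_LIGHTEVAL evaluates to given the task list above:
tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
# -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0'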
src/custom_tasks/winograd_task.py
ADDED

@@ -0,0 +1,57 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def winograd_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some sanitization, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0])  # remove xml tags
+    return 1 if pred == golds[0] else 0
+
+winograd_acc_metric = CorpusLevelMetric(
+    metric="winograd_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=winograd_eval_fn
+)
+extend_enum(Metrics, 'winograd_acc_metric', winograd_acc_metric)
+
+def winograd_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=line["response"],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple task (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+winograd_task = LightevalTaskConfig(
+    name="winograd-acc",
+    prompt_function="winograd_prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["winograd"],
+    evaluation_splits=["winograd"],
+    metric=['winograd_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=32
+)
+winograd_task.stop_sequence = as_list(winograd_task.stop_sequence)
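As a standalone sanity check of the scoring logic (not part of the commit; the function body is copied from the diff above), note that the XML-tag stripping rescues tag-wrapped answers, but the comparison itself is exact, with no whitespace or case normalization:

import re

def winograd_eval_fn(golds, predictions, formatted_doc=None):
    if len(predictions) > 1:
        raise ValueError("Predictions should have one item")
    pred = re.sub('<[^>]+>', '', predictions[0])  # strip xml-style tags
    return 1 if pred == golds[0] else 0

# Tag-wrapped output still matches after sanitization:
assert winograd_eval_fn(["the dog"], ["<answer>the dog</answer>"]) == 1
# But stray whitespace is not stripped, so this scores 0:
assert winograd_eval_fn(["the dog"], [" the dog"]) == 0

Depending on how chatty the evaluated models are, a .strip() on the prediction before comparing might be worth considering.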