Shaltiel committed
Commit adf0b2e
1 Parent(s): fe281bf

Fixed whitespace for prediction

custom_tasks.py CHANGED
@@ -9,13 +9,11 @@ Author:
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
-from src.custom_tasks.commonsense_task import *
-from src.custom_tasks.arc_challenge_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task, commonsense_qa_task, arc_challenge_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
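As a quick sanity check on the trimmed table, a minimal sketch (assuming each dict returned by as_dict() carries a "name" key, as the __main__ block implies; the expected names are a guess based on src/about.py):

    # Hypothetical check: list the registered custom task names.
    print([t["name"] for t in TASKS_TABLE])
    # expected something like: ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']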
src/about.py CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'arc:challenge']
-TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'
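For reference, the join above now expands to the following task string (a worked example, not part of the commit):

    tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
    TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
    # -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0'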
src/custom_tasks/arc_challenge_task.py DELETED
@@ -1,24 +0,0 @@
-import re
-import string
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.metrics import Metrics, MetricCategory
-from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
-from aenum import extend_enum
-import numpy as np
-from lighteval.tasks.requests import Doc
-from Levenshtein import distance
-import collections
-from lighteval.utils import as_list
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-arc_challenge_task = LightevalTaskConfig(
-    name="arc:challenge",
-    prompt_function="arc",
-    hf_repo="ai2_arc",
-    hf_subset="ARC-Challenge",
-    evaluation_splits=["test"],
-    generation_size=1,
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    trust_dataset=True,
-    stop_sequence=["\n"],
-)
src/custom_tasks/commonsense_task.py DELETED
@@ -1,31 +0,0 @@
-import re
-import string
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.metrics import Metrics, MetricCategory
-from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
-from aenum import extend_enum
-import numpy as np
-from lighteval.tasks.requests import Doc
-from Levenshtein import distance
-import collections
-from lighteval.utils import as_list
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-def commonsense_qa_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["question"],
-        choices=[f" {c}" for c in line["choices"]["text"]],
-        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
-        instruction="",
-    )
-
-commonsense_qa_task = LightevalTaskConfig(
-    name="commonsense_qa",
-    prompt_function="commonsense_qa_prompt",
-    hf_repo="commonsense_qa",
-    hf_subset="default",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    trust_dataset=True,
-    stop_sequence=["\n"],
-),
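In the deleted prompt function, the gold index comes from the answer letter. A small illustration, assuming LETTER_INDICES is the usual ["A", "B", "C", ...] list from lighteval's prompt-formatting helpers (the sample line is made up):

    LETTER_INDICES = ["A", "B", "C", "D", "E"]  # assumed prefix of lighteval's list
    line = {"answerKey": "C", "choices": {"text": ["red", "green", "blue", "cyan", "pink"]}}
    gold_index = LETTER_INDICES.index(line["answerKey"].strip())  # -> 2, i.e. "blue"
    choices = [f" {c}" for c in line["choices"]["text"]]          # note the leading space per choice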
src/custom_tasks/heq_task.py CHANGED
@@ -73,7 +73,7 @@ def tlnls(a_gold, a_pred):
 def heq_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
     if len(predictions) > 1:
         raise ValueError("Predictions should have one item")
-    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub('<[^>]+>', '', predictions[0]).strip() # remove xml tags
     return max([tlnls(x, pred) for x in golds])
 
 heq_tlnls_metric = CorpusLevelMetric(
@@ -93,8 +93,8 @@ def heq_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=list(range(len(line["response"]))),
         instruction="",
     )
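The effect of the added .strip(): whitespace left over after tag removal no longer lowers the tlnls score against the gold answers. A minimal illustration (the prediction string is hypothetical):

    import re

    raw_pred = " <b>42</b> "                               # hypothetical generation with XML tags and padding
    old_pred = re.sub('<[^>]+>', '', raw_pred)             # ' 42 '  -- stray whitespace kept
    new_pred = re.sub('<[^>]+>', '', raw_pred).strip()     # '42'    -- clean comparison against the golds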
src/custom_tasks/sentiment_task.py CHANGED
@@ -37,8 +37,8 @@ def sentiment_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=0,
         instruction="",
     )
src/custom_tasks/winograd_task.py CHANGED
@@ -34,8 +34,8 @@ def winograd_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=0,
         instruction="",
     )