Added sentiment, added lots of monkey patching
Files changed:
- custom_tasks.py +3 -2
- main_backend_lighteval.py +3 -6
- src/about.py +1 -1
- src/backend/run_eval_suite_lighteval.py +9 -12
- heq_task.py → src/custom_tasks/heq_task.py +3 -3
- src/custom_tasks/sentiment_task.py +60 -0
custom_tasks.py
CHANGED

@@ -6,12 +6,13 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im

 Author:
 """
-from heq_task import *
+from src.custom_tasks.heq_task import *
+from src.custom_tasks.sentiment_task import *

 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task]]

 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
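Both task definitions now come from src/custom_tasks/ and both are registered in TASKS_TABLE. One detail in the unchanged __main__ block: print(t["name"] for t in TASKS_TABLE) passes a bare generator expression to print, so it prints a generator object rather than the task names. A self-contained sketch of the difference, using stand-in data instead of the real task configs:

    # Stand-in TASKS_TABLE; the real one is built from task.as_dict() above.
    TASKS_TABLE = [{"name": "heq-qa-tlnls"}, {"name": "sentiment-acc"}]

    print(t["name"] for t in TASKS_TABLE)    # <generator object <genexpr> at 0x...>
    print([t["name"] for t in TASKS_TABLE])  # ['heq-qa-tlnls', 'sentiment-acc']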
main_backend_lighteval.py
CHANGED

@@ -2,8 +2,6 @@ import logging
 import pprint

 import lighteval.models.endpoint_model
-orig_endpoint_model_greedy_until = lighteval.models.endpoint_model.InferenceEndpointModel.greedy_until
-orig_endpoint_model_process_batch_generate = lighteval.models.endpoint_model.InferenceEndpointModel.__process_batch_generate
 class GoodInferenceEndpointModel(lighteval.models.endpoint_model.InferenceEndpointModel):

     @property
@@ -13,11 +11,10 @@ class GoodInferenceEndpointModel(lighteval.models.endpoint_model.InferenceEndpoi
     def greedy_until(self, requests: list, *args, **kwargs):
         for request in requests:
             request.tokenized_context = self.tok_encode(request.context)
-
-        return orig_endpoint_model_greedy_until(self, requests, *args, **kwargs)
+        return super().greedy_until(requests, *args, **kwargs)

-    def
-        return
+    def _InferenceEndpointModel__process_batch_generate(self, requests: list, returns_logits: bool):
+        return super()._InferenceEndpointModel__process_batch_generate(requests)

     @property
     def disable_tqdm(self) -> bool:
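The patched greedy_until now just pre-tokenizes each request's context and delegates to the parent via super(), replacing the earlier module-level monkey patching. The second override looks odd but is forced by Python name mangling: a method defined as __process_batch_generate inside InferenceEndpointModel is stored on the class as _InferenceEndpointModel__process_batch_generate, and that mangled name is the only one a subclass can override or call. Note also that the patched method accepts returns_logits but does not forward it to the parent call. A minimal sketch of the mangling behaviour with toy classes (nothing here is lighteval API):

    class Base:
        def run(self):
            # Inside Base, self.__step compiles to self._Base__step.
            return self.__step("payload")

        def __step(self, data):
            return f"base handled {data}"

    class Patched(Base):
        # Defining __step here would become _Patched__step and never be called
        # by Base.run; the override has to use Base's mangled name instead.
        def _Base__step(self, data):
            return f"patched handled {data}"

    print(Base().run())     # base handled payload
    print(Patched().run())  # patched handled payload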
src/about.py
CHANGED

@@ -21,4 +21,4 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------

 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0"
+TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"
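For orientation, each comma-separated entry in TASKS_LIGHTEVAL appears to follow lighteval's suite|task|num_fewshot|flag layout; the field names here are an assumption based on the surrounding examples, not something this commit states. A small illustrative split of the new value:

    # Field names are assumed ("suite|task|fewshot|flag"); only the string itself
    # comes from src/about.py.
    TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"

    for entry in TASKS_LIGHTEVAL.split(","):
        suite, task, num_fewshot, flag = entry.split("|")
        print(suite, task, num_fewshot, flag)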
src/backend/run_eval_suite_lighteval.py
CHANGED

@@ -5,7 +5,7 @@ from datetime import datetime
 from argparse import Namespace

 from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
+from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER
 from src.backend.manage_requests import EvalRequest

 logging.getLogger("openai").setLevel(logging.WARNING)
@@ -32,19 +32,16 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         "push_details_to_hub": True,
         "public_run": False,
         "cache_dir": CACHE_PATH,
-        "results_org":
+        "results_org": OWNER,
         "output_dir": local_dir,
         "override_batch_size": batch_size,
         "custom_tasks": "custom_tasks.py",
         "tasks": task_names,
         "dataset_loading_processes": 24,
         "num_fewshot_seeds": 0,
-        "reuse_existing":
+        "reuse_existing": False
     })

-    results = main(args)
-    return results
-
     try:
         results = main(args)

@@ -55,12 +52,12 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         dumped = json.dumps(results, indent=2)
         print(dumped)
     except Exception as ex: # if eval failed, we force a cleanup
-
+        import traceback
+        traceback.print_exception(ex)
         env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-
-
-
-
+        args.reuse_existing = True
+        model_config = create_model_config(args=args, accelerator=accelerator)
+        model, _ = load_model(config=model_config, env_config=env_config)
+        model.cleanup()

     return results
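Two notes on the new except block: traceback.print_exception(ex) with a bare exception object requires Python 3.10 or newer (older interpreters need the three-argument form), and because the handler neither re-raises nor assigns results, the trailing return results will itself fail with an UnboundLocalError when main(args) raises (unless results is set earlier, outside this hunk). A small sketch of a version-tolerant traceback print, with a stand-in failure instead of the real main(args):

    import sys
    import traceback

    def print_failure(ex: BaseException) -> None:
        # Python 3.10+ accepts the exception object directly; older versions
        # need the (type, value, traceback) triple.
        if sys.version_info >= (3, 10):
            traceback.print_exception(ex)
        else:
            traceback.print_exception(type(ex), ex, ex.__traceback__)

    try:
        raise RuntimeError("evaluation failed")  # stand-in for main(args)
    except Exception as ex:
        print_failure(ex)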
heq_task.py → src/custom_tasks/heq_task.py
RENAMED

@@ -70,7 +70,7 @@ def tlnls(a_gold, a_pred):
     else:
         return compute_f1(a_gold, a_pred)

-def heq_eval_fn(golds: list[str], predictions: list[str]):
+def heq_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
     if len(predictions) > 1:
         raise ValueError("Predictions should have one item")
     return max([tlnls(x, predictions[0]) for x in golds])
@@ -98,7 +98,6 @@ def heq_prompt_fn(line, task_name: str = None):
         instruction="",
     )

-## EVAL WITH NO SUBSET ##
 # This is how you create a simple tasks (like hellaswag) which has one single subset
 # attached to it, and one evaluation possible.
 heq_task = LightevalTaskConfig(
@@ -110,6 +109,7 @@ heq_task = LightevalTaskConfig(
     hf_avail_splits=["heq"],
     evaluation_splits=["heq"],
     metric=['heq_tlnls_metric'],
-    stop_sequence=['\n']
+    stop_sequence=['\n'],
+    generation_size=64
 )
 heq_task.stop_sequence = as_list(heq_task.stop_sequence)
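The first hunk only shows the tail of tlnls, which falls back to compute_f1; that helper's body is outside the diff, and heq_eval_fn simply keeps the best score over all gold answers (the added generation_size=64 presumably caps how many tokens are generated per request). For orientation, a sketch of the standard SQuAD-style token-level F1 that such a helper typically computes — an assumption about intent, not the committed implementation:

    import collections

    def token_f1(a_gold: str, a_pred: str) -> float:
        """SQuAD-style token-overlap F1 between one gold answer and a prediction."""
        gold_toks, pred_toks = a_gold.split(), a_pred.split()
        common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
        num_same = sum(common.values())
        if not gold_toks or not pred_toks:
            return float(gold_toks == pred_toks)  # both empty -> 1.0, else 0.0
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_toks)
        recall = num_same / len(gold_toks)
        return 2 * precision * recall / (precision + recall)

    # Like heq_eval_fn: score one prediction against several golds, keep the max.
    print(max(token_f1(g, "the capital is jerusalem") for g in ["jerusalem", "the capital"]))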
src/custom_tasks/sentiment_task.py
ADDED

@@ -0,0 +1,60 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def sentiment_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some santizations, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub(r'^100%', '', pred) # remove 100% at beginning, some gemma weirdness
+    pred = pred.strip()
+
+    return 1 if pred == golds[0] else 0
+
+sentiment_acc_metric = CorpusLevelMetric(
+    metric="sentiment_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=sentiment_eval_fn
+)
+extend_enum(Metrics, 'sentiment_acc_metric', sentiment_acc_metric)
+
+def sentiment_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=line["response"],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+sentiment_task = LightevalTaskConfig(
+    name="sentiment-acc",
+    prompt_function="sentiment_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["sentiment"],
+    evaluation_splits=["sentiment"],
+    metric=['sentiment_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=32
+)
+sentiment_task.stop_sequence = as_list(sentiment_task.stop_sequence)
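Because sentiment_eval_fn scores 1 only when the sanitized prediction exactly matches the first gold string, label formatting matters. A self-contained sketch of the same sanitize-then-compare logic with made-up labels (the real gold strings come from the dataset's "response" field):

    import re

    def score(prediction: str, gold: str) -> int:
        """Mirrors sentiment_eval_fn's sanitize-then-exact-match scoring."""
        pred = re.sub(r"<[^>]+>", "", prediction)  # strip XML-like tags
        pred = re.sub(r"^100%", "", pred)          # strip a leading "100%"
        return 1 if pred.strip() == gold else 0

    # Made-up labels for illustration only.
    print(score("<answer>positive</answer>", "positive"))  # 1: tags stripped
    print(score("100% positive", "positive"))              # 1: leading "100%" stripped
    print(score("Positive", "positive"))                   # 0: comparison is case-sensitive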