Shaltiel committed on
Commit 7798457
1 Parent(s): 8ff6e46

Added sentiment, added lots of monkey patching

custom_tasks.py CHANGED
@@ -6,12 +6,13 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im
 
 Author:
 """
-from heq_task import *
+from src.custom_tasks.heq_task import *
+from src.custom_tasks.sentiment_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
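Both task modules are pulled in with star imports so that heq_task and sentiment_task are visible when TASKS_TABLE is built. A hypothetical smoke test (not part of the repo) for checking that both tasks are registered, run from the repo root with lighteval installed:

# Hypothetical smoke test: confirm both custom tasks land in TASKS_TABLE.
from custom_tasks import TASKS_TABLE  # the module shown in the diff above

names = [t["name"] for t in TASKS_TABLE]
assert "heq-qa-tlnls" in names and "sentiment-acc" in names, names
# Note: the __main__ block above prints a bare generator object; listing the
# comprehension, as here, prints the actual task names.
print(names)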
main_backend_lighteval.py CHANGED
@@ -2,8 +2,6 @@ import logging
 import pprint
 
 import lighteval.models.endpoint_model
-orig_endpoint_model_greedy_until = lighteval.models.endpoint_model.InferenceEndpointModel.greedy_until
-orig_endpoint_model_process_batch_generate = lighteval.models.endpoint_model.InferenceEndpointModel.__process_batch_generate
 class GoodInferenceEndpointModel(lighteval.models.endpoint_model.InferenceEndpointModel):
 
     @property
@@ -13,11 +11,10 @@ class GoodInferenceEndpointModel(lighteval.models.endpoint_model.InferenceEndpoi
     def greedy_until(self, requests: list, *args, **kwargs):
         for request in requests:
             request.tokenized_context = self.tok_encode(request.context)
-        # using this and not super() because we don't want self to change
-        return orig_endpoint_model_greedy_until(self, requests, *args, **kwargs)
+        return super().greedy_until(requests, *args, **kwargs)
 
-    def __process_batch_generate(self, requests: list, returns_logits: bool):
-        return orig_endpoint_model_process_batch_generate(self, requests)
+    def _InferenceEndpointModel__process_batch_generate(self, requests: list, returns_logits: bool):
+        return super()._InferenceEndpointModel__process_batch_generate(requests)
 
     @property
     def disable_tqdm(self) -> bool:
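Because __process_batch_generate has two leading underscores, Python stores it on the parent class as _InferenceEndpointModel__process_batch_generate and rewrites references to it inside InferenceEndpointModel the same way, so the subclass can override the mangled name directly instead of patching module attributes as before. A toy sketch of that mechanism (nothing here touches lighteval):

# Toy illustration of Python name mangling; Base/Child are stand-ins, not lighteval classes.
class Base:
    def run(self):
        return self.__step()        # compiled inside Base as self._Base__step()

    def __step(self):               # stored on the class as _Base__step
        return "base"

class Child(Base):
    def _Base__step(self):          # overrides the mangled name that Base.run() looks up
        return "child"

assert Child().run() == "child"

The override in GoodInferenceEndpointModel works the same way: internal calls in the parent that refer to self.__process_batch_generate resolve to the mangled name, so the subclass intercepts them without the removed orig_* module-level hooks.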
src/about.py CHANGED
@@ -21,4 +21,4 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0"
+TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"
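Each comma-separated entry follows lighteval's suite|task|few_shot|... pattern. A small sketch for splitting such a spec into its fields; the exact meaning of the final field (commonly a few-shot truncation toggle) is an assumption, not something this repo confirms:

# Hypothetical helper: split a comma-separated lighteval task spec into its fields.
TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"

for entry in TASKS_LIGHTEVAL.split(","):
    suite, task, num_fewshot, flag = entry.split("|")
    print(f"suite={suite} task={task} fewshot={num_fewshot} flag={flag}")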
src/backend/run_eval_suite_lighteval.py CHANGED
@@ -5,7 +5,7 @@ from datetime import datetime
 from argparse import Namespace
 
 from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
+from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER
 from src.backend.manage_requests import EvalRequest
 
 logging.getLogger("openai").setLevel(logging.WARNING)
@@ -32,19 +32,16 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         "push_details_to_hub": True,
         "public_run": False,
         "cache_dir": CACHE_PATH,
-        "results_org": RESULTS_REPO,
+        "results_org": OWNER,
         "output_dir": local_dir,
         "override_batch_size": batch_size,
         "custom_tasks": "custom_tasks.py",
         "tasks": task_names,
         "dataset_loading_processes": 24,
         "num_fewshot_seeds": 0,
-        "reuse_existing": True
+        "reuse_existing": False
     })
 
-    results = main(args)
-    return results
-
     try:
         results = main(args)
 
@@ -55,12 +52,12 @@
         dumped = json.dumps(results, indent=2)
         print(dumped)
     except Exception as ex:  # if eval failed, we force a cleanup
-        print(ex)
+        import traceback
+        traceback.print_exception(ex)
         env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-        # model_config = create_model_config(args=args, accelerator=accelerator)
-        # model, _ = load_model(config=model_config, env_config=env_config)
-        # model.cleanup()
-
+        args.reuse_existing = True
+        model_config = create_model_config(args=args, accelerator=accelerator)
+        model, _ = load_model(config=model_config, env_config=env_config)
+        model.cleanup()
 
     return results
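On failure, the backend now prints the full traceback and re-creates the model config with reuse_existing = True so the inference endpoint that main() already started can be attached to and shut down rather than left running. A condensed sketch of that pattern, reusing the names imported at the top of this file (their signatures are taken from the diff, not verified against lighteval); unlike the diff, it re-raises after cleanup, since results is only bound when main() succeeds:

# Condensed sketch of the cleanup-on-failure pattern; assumes the imports at the top
# of run_eval_suite_lighteval.py (main, EnvConfig, create_model_config, load_model, TOKEN).
def run_with_forced_cleanup(args, accelerator):
    try:
        return main(args)
    except Exception as ex:
        import traceback
        traceback.print_exception(ex)              # full traceback instead of print(ex)
        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
        args.reuse_existing = True                 # attach to the endpoint main() created
        model_config = create_model_config(args=args, accelerator=accelerator)
        model, _ = load_model(config=model_config, env_config=env_config)
        model.cleanup()                            # shut the endpoint down before propagating
        raise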
heq_task.py → src/custom_tasks/heq_task.py RENAMED
@@ -70,7 +70,7 @@ def tlnls(a_gold, a_pred):
     else:
         return compute_f1(a_gold, a_pred)
 
-def heq_eval_fn(golds: list[str], predictions: list[str]):
+def heq_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
     if len(predictions) > 1:
         raise ValueError("Predictions should have one item")
     return max([tlnls(x, predictions[0]) for x in golds])
@@ -98,7 +98,6 @@ def heq_prompt_fn(line, task_name: str = None):
         instruction="",
     )
 
-## EVAL WITH NO SUBSET ##
 # This is how you create a simple tasks (like hellaswag) which has one single subset
 # attached to it, and one evaluation possible.
 heq_task = LightevalTaskConfig(
@@ -110,6 +109,7 @@ heq_task = LightevalTaskConfig(
     hf_avail_splits=["heq"],
     evaluation_splits=["heq"],
     metric=['heq_tlnls_metric'],
-    stop_sequence=['\n']
+    stop_sequence=['\n'],
+    generation_size=64
 )
 heq_task.stop_sequence = as_list(heq_task.stop_sequence)
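heq_eval_fn now also accepts formatted_doc, presumably matching the keyword lighteval passes to sample-level metric functions, and the task caps generation at 64 tokens. The scorer's contract (exactly one prediction, best score over all acceptable gold answers) in a self-contained sketch with a stand-in for tlnls, whose real definition sits earlier in this file:

# Stand-in scorer only; the real tlnls/compute_f1 are defined above in heq_task.py.
def fake_tlnls(a_gold: str, a_pred: str) -> float:
    return 1.0 if a_gold == a_pred else 0.0

def eval_like_heq(golds, predictions, formatted_doc=None):
    if len(predictions) > 1:
        raise ValueError("Predictions should have one item")
    return max(fake_tlnls(g, predictions[0]) for g in golds)  # best match over all golds

assert eval_like_heq(["Jerusalem", "the city of Jerusalem"], ["Jerusalem"]) == 1.0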
src/custom_tasks/sentiment_task.py ADDED
@@ -0,0 +1,60 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def sentiment_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some santizations, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0])  # remove xml tags
+    pred = re.sub(r'^100%', '', pred)  # remove 100% at beginning, some gemma weirdness
+    pred = pred.strip()
+
+    return 1 if pred == golds[0] else 0
+
+sentiment_acc_metric = CorpusLevelMetric(
+    metric="sentiment_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=sentiment_eval_fn
+)
+extend_enum(Metrics, 'sentiment_acc_metric', sentiment_acc_metric)
+
+def sentiment_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=line["response"],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+sentiment_task = LightevalTaskConfig(
+    name="sentiment-acc",
+    prompt_function="sentiment_prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["sentiment"],
+    evaluation_splits=["sentiment"],
+    metric=['sentiment_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=32
+)
+sentiment_task.stop_sequence = as_list(sentiment_task.stop_sequence)
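The scorer does an exact match after two sanitization passes: strip XML-style tags, then drop a leading "100%". A quick standalone check of those substitutions on hypothetical model outputs (the labels here are illustrative, not taken from the dataset):

import re

def sanitize(pred: str) -> str:
    # same two substitutions as sentiment_eval_fn above
    pred = re.sub('<[^>]+>', '', pred)   # remove xml-style tags
    pred = re.sub(r'^100%', '', pred)    # remove a leading "100%"
    return pred.strip()

assert sanitize("<answer>Positive</answer>") == "Positive"
assert sanitize("100% Negative") == "Negative"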