Added sentiment, added lots of monkey patching
Files changed:
- custom_tasks.py +3 -2
- main_backend_lighteval.py +3 -6
- src/about.py +1 -1
- src/backend/run_eval_suite_lighteval.py +9 -12
- heq_task.py → src/custom_tasks/heq_task.py +3 -3
- src/custom_tasks/sentiment_task.py +60 -0
custom_tasks.py
CHANGED

@@ -6,12 +6,13 @@ This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then im

 Author:
 """
-from heq_task import *
+from src.custom_tasks.heq_task import *
+from src.custom_tasks.sentiment_task import *

 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task]]

 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
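Both task definitions now come from src/custom_tasks/ and both are registered in TASKS_TABLE. One detail in the unchanged __main__ block: print(t["name"] for t in TASKS_TABLE) passes a bare generator expression to print, so it prints a generator object rather than the task names. A self-contained sketch of the difference, using stand-in data instead of the real task configs:

    # Stand-in TASKS_TABLE; the real one is built from task.as_dict() above.
    TASKS_TABLE = [{"name": "heq-qa-tlnls"}, {"name": "sentiment-acc"}]

    print(t["name"] for t in TASKS_TABLE)    # <generator object <genexpr> at 0x...>
    print([t["name"] for t in TASKS_TABLE])  # ['heq-qa-tlnls', 'sentiment-acc']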
main_backend_lighteval.py
CHANGED

@@ -2,8 +2,6 @@ import logging
 import pprint

 import lighteval.models.endpoint_model
-orig_endpoint_model_greedy_until = lighteval.models.endpoint_model.InferenceEndpointModel.greedy_until
-orig_endpoint_model_process_batch_generate = lighteval.models.endpoint_model.InferenceEndpointModel.__process_batch_generate
 class GoodInferenceEndpointModel(lighteval.models.endpoint_model.InferenceEndpointModel):

     @property
@@ -13,11 +11,10 @@ class GoodInferenceEndpointModel(lighteval.models.endpoint_model.InferenceEndpoi
     def greedy_until(self, requests: list, *args, **kwargs):
         for request in requests:
             request.tokenized_context = self.tok_encode(request.context)
-
-        return orig_endpoint_model_greedy_until(self, requests, *args, **kwargs)
+        return super().greedy_until(requests, *args, **kwargs)

-    def
-        return
+    def _InferenceEndpointModel__process_batch_generate(self, requests: list, returns_logits: bool):
+        return super()._InferenceEndpointModel__process_batch_generate(requests)

     @property
     def disable_tqdm(self) -> bool:
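The patched greedy_until now just pre-tokenizes each request's context and delegates to the parent via super(), replacing the earlier module-level monkey patching. The second override looks odd but is forced by Python name mangling: a method defined as __process_batch_generate inside InferenceEndpointModel is stored on the class as _InferenceEndpointModel__process_batch_generate, and that mangled name is the only one a subclass can override or call. Note also that the patched method accepts returns_logits but does not forward it to the parent call. A minimal sketch of the mangling behaviour with toy classes (nothing here is lighteval API):

    class Base:
        def run(self):
            # Inside Base, self.__step compiles to self._Base__step.
            return self.__step("payload")

        def __step(self, data):
            return f"base handled {data}"

    class Patched(Base):
        # Defining __step here would become _Patched__step and never be called
        # by Base.run; the override has to use Base's mangled name instead.
        def _Base__step(self, data):
            return f"patched handled {data}"

    print(Base().run())     # base handled payload
    print(Patched().run())  # patched handled payload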
src/about.py
CHANGED

@@ -21,4 +21,4 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------

 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0"
+TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"
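For orientation, each comma-separated entry in TASKS_LIGHTEVAL appears to follow lighteval's suite|task|num_fewshot|flag layout; the field names here are an assumption based on the surrounding examples, not something this commit states. A small illustrative split of the new value:

    # Field names are assumed ("suite|task|fewshot|flag"); only the string itself
    # comes from src/about.py.
    TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"

    for entry in TASKS_LIGHTEVAL.split(","):
        suite, task, num_fewshot, flag = entry.split("|")
        print(suite, task, num_fewshot, flag)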
src/backend/run_eval_suite_lighteval.py
CHANGED

@@ -5,7 +5,7 @@ from datetime import datetime
 from argparse import Namespace

 from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
+from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN, OWNER
 from src.backend.manage_requests import EvalRequest

 logging.getLogger("openai").setLevel(logging.WARNING)
@@ -32,19 +32,16 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         "push_details_to_hub": True,
         "public_run": False,
         "cache_dir": CACHE_PATH,
-        "results_org":
+        "results_org": OWNER,
         "output_dir": local_dir,
         "override_batch_size": batch_size,
         "custom_tasks": "custom_tasks.py",
         "tasks": task_names,
         "dataset_loading_processes": 24,
         "num_fewshot_seeds": 0,
-        "reuse_existing":
+        "reuse_existing": False
     })

-    results = main(args)
-    return results
-
     try:
         results = main(args)

@@ -55,12 +52,12 @@ def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int,
         dumped = json.dumps(results, indent=2)
         print(dumped)
     except Exception as ex: # if eval failed, we force a cleanup
-
+        import traceback
+        traceback.print_exception(ex)
         env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-
-
-
-
+        args.reuse_existing = True
+        model_config = create_model_config(args=args, accelerator=accelerator)
+        model, _ = load_model(config=model_config, env_config=env_config)
+        model.cleanup()

     return results
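Two notes on the new except block: traceback.print_exception(ex) with a bare exception object requires Python 3.10 or newer (older interpreters need the three-argument form), and because the handler neither re-raises nor assigns results, the trailing return results will itself fail with an UnboundLocalError when main(args) raises (unless results is set earlier, outside this hunk). A small sketch of a version-tolerant traceback print, with a stand-in failure instead of the real main(args):

    import sys
    import traceback

    def print_failure(ex: BaseException) -> None:
        # Python 3.10+ accepts the exception object directly; older versions
        # need the (type, value, traceback) triple.
        if sys.version_info >= (3, 10):
            traceback.print_exception(ex)
        else:
            traceback.print_exception(type(ex), ex, ex.__traceback__)

    try:
        raise RuntimeError("evaluation failed")  # stand-in for main(args)
    except Exception as ex:
        print_failure(ex)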
heq_task.py → src/custom_tasks/heq_task.py
RENAMED

@@ -70,7 +70,7 @@ def tlnls(a_gold, a_pred):
     else:
         return compute_f1(a_gold, a_pred)

-def heq_eval_fn(golds: list[str], predictions: list[str]):
+def heq_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
     if len(predictions) > 1:
         raise ValueError("Predictions should have one item")
     return max([tlnls(x, predictions[0]) for x in golds])
@@ -98,7 +98,6 @@ def heq_prompt_fn(line, task_name: str = None):
         instruction="",
     )

-## EVAL WITH NO SUBSET ##
 # This is how you create a simple tasks (like hellaswag) which has one single subset
 # attached to it, and one evaluation possible.
 heq_task = LightevalTaskConfig(
@@ -110,6 +109,7 @@ heq_task = LightevalTaskConfig(
     hf_avail_splits=["heq"],
     evaluation_splits=["heq"],
     metric=['heq_tlnls_metric'],
-    stop_sequence=['\n']
+    stop_sequence=['\n'],
+    generation_size=64
 )
 heq_task.stop_sequence = as_list(heq_task.stop_sequence)
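The first hunk only shows the tail of tlnls, which falls back to compute_f1; that helper's body is outside the diff, and heq_eval_fn simply keeps the best score over all gold answers (the added generation_size=64 presumably caps how many tokens are generated per request). For orientation, a sketch of the standard SQuAD-style token-level F1 that such a helper typically computes — an assumption about intent, not the committed implementation:

    import collections

    def token_f1(a_gold: str, a_pred: str) -> float:
        """SQuAD-style token-overlap F1 between one gold answer and a prediction."""
        gold_toks, pred_toks = a_gold.split(), a_pred.split()
        common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
        num_same = sum(common.values())
        if not gold_toks or not pred_toks:
            return float(gold_toks == pred_toks)  # both empty -> 1.0, else 0.0
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_toks)
        recall = num_same / len(gold_toks)
        return 2 * precision * recall / (precision + recall)

    # Like heq_eval_fn: score one prediction against several golds, keep the max.
    print(max(token_f1(g, "the capital is jerusalem") for g in ["jerusalem", "the capital"]))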
src/custom_tasks/sentiment_task.py
ADDED

@@ -0,0 +1,60 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def sentiment_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some santizations, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub(r'^100%', '', pred) # remove 100% at beginning, some gemma weirdness
+    pred = pred.strip()
+
+    return 1 if pred == golds[0] else 0
+
+sentiment_acc_metric = CorpusLevelMetric(
+    metric="sentiment_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=sentiment_eval_fn
+)
+extend_enum(Metrics, 'sentiment_acc_metric', sentiment_acc_metric)
+
+def sentiment_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=line["response"],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple tasks (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+sentiment_task = LightevalTaskConfig(
+    name="sentiment-acc",
+    prompt_function="sentiment_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["sentiment"],
+    evaluation_splits=["sentiment"],
+    metric=['sentiment_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=32
+)
+sentiment_task.stop_sequence = as_list(sentiment_task.stop_sequence)
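Because sentiment_eval_fn scores 1 only when the sanitized prediction exactly matches the first gold string, label formatting matters. A self-contained sketch of the same sanitize-then-compare logic with made-up labels (the real gold strings come from the dataset's "response" field):

    import re

    def score(prediction: str, gold: str) -> int:
        """Mirrors sentiment_eval_fn's sanitize-then-exact-match scoring."""
        pred = re.sub(r"<[^>]+>", "", prediction)  # strip XML-like tags
        pred = re.sub(r"^100%", "", pred)          # strip a leading "100%"
        return 1 if pred.strip() == gold else 0

    # Made-up labels for illustration only.
    print(score("<answer>positive</answer>", "positive"))  # 1: tags stripped
    print(score("100% positive", "positive"))              # 1: leading "100%" stripped
    print(score("Positive", "positive"))                   # 0: comparison is case-sensitive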