import re
import string
import collections

import numpy as np
from aenum import extend_enum
from Levenshtein import distance

from lighteval.metrics import Metrics, MetricCategory
from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils import as_list


def sentiment_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
    """Sample-level accuracy for the sentiment task.

    Sanitizes the model output (some models wrap the answer in XML-style
    tags, and some Gemma variants prefix the response with "100%") and then
    scores it by exact match against the single gold answer.

    Args:
        golds: gold answers; only ``golds[0]`` is compared against.
        predictions: model outputs; must contain exactly one item.
        formatted_doc: the originating Doc (unused; kept for the metric API).

    Returns:
        1 if the sanitized prediction exactly equals the gold, else 0.

    Raises:
        ValueError: if ``predictions`` does not contain exactly one item.
    """
    # The original check only rejected >1 predictions, so an empty list
    # crashed with an opaque IndexError below; require exactly one item.
    if len(predictions) != 1:
        raise ValueError("Predictions should have one item")
    # Do some sanitization, since some models produce more info around the answer.
    pred = re.sub(r'<[^>]+>', '', predictions[0])  # remove xml tags
    pred = re.sub(r'^100%', '', pred)  # remove 100% at beginning, some gemma weirdness
    pred = pred.strip()
    return 1 if pred == golds[0] else 0


# Corpus-level metric: the mean of the per-sample exact-match scores.
sentiment_acc_metric = CorpusLevelMetric(
    metric="sentiment_acc",
    higher_is_better=True,
    category=MetricCategory.GENERATIVE,
    use_case=MetricUseCase.ACCURACY,
    corpus_level_fn=np.mean,
    sample_level_fn=sentiment_eval_fn,
)
# Register the custom metric so task configs can reference it by name.
extend_enum(Metrics, 'sentiment_acc_metric', sentiment_acc_metric)


def sentiment_prompt_fn(line, task_name: str = None):
    """Defines how to go from a dataset line to a doc object.

    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get
    more info about what this function should do in the README.
    """
    return Doc(
        task_name=task_name,
        query=line["prompt"],
        choices=line["response"],
        gold_index=0,
        instruction="",
    )


# This is how you create a simple task (like hellaswag) which has one single
# subset attached to it, and one evaluation possible.
# Task definition: a single-subset task ("sentiment" split) scored with the
# custom exact-match accuracy metric registered above.
sentiment_task = LightevalTaskConfig(
    name="sentiment-acc",
    # must be defined in the file or imported from
    # src/lighteval/tasks/tasks_prompt_formatting.py
    prompt_function="sentiment_prompt_fn",
    suite=["custom"],
    hf_repo="dicta-hebrew-llm-leaderboard/tests",
    hf_subset="default",
    hf_avail_splits=["sentiment"],
    evaluation_splits=["sentiment"],
    metric=['sentiment_acc_metric'],
    stop_sequence=['\n'],
    generation_size=32,
)

# Normalize stop_sequence to a list in case a bare string was supplied.
sentiment_task.stop_sequence = as_list(sentiment_task.stop_sequence)