"""Custom lighteval task: Hebrew <-> English translation scored with sentence-level BLEU.

Defines the prompt function, a ``sentence_bleu`` metric (added to lighteval's
``Metrics`` enum) and the ``he-en-trans-bleu`` task configuration.
"""
import collections
import re
import string
from typing import Optional

import numpy as np
import sacrebleu
from aenum import extend_enum
from Levenshtein import distance
from lighteval.metrics import Metrics, MetricCategory
from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils import as_list

from ..envs import OWNER


def trans_prompt_fn(line, task_name: Optional[str] = None):
    """Defines how to go from a dataset line to a doc object.

    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more
    info about what this function should do in the README.

    Args:
        line: Dataset row. Reads ``line["prompt"]`` (the query text) and
            ``line["response"]``, whose first element is used as the single
            gold choice.
        task_name: Name of the task, forwarded to the ``Doc``.

    Returns:
        A ``Doc`` with the stripped prompt as query, one stripped gold choice
        at index 0, and an empty instruction.
    """
    return Doc(
        task_name=task_name,
        query=line["prompt"].strip(),
        choices=[line["response"][0].strip()],
        gold_index=[0],
        instruction="",
    )


def translation_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Optional[Doc] = None):
    """Score a single prediction against the gold references with sentence BLEU.

    Args:
        golds: Reference translations for the sample.
        predictions: Model outputs for the sample; must contain exactly one item.
        formatted_doc: The originating ``Doc``; not used by this metric.

    Returns:
        The sacrebleu sentence-level BLEU score divided by 100, as a float.

    Raises:
        ValueError: If ``predictions`` does not contain exactly one item.
    """
    # Reject empty input as well as multiple predictions, so the failure is the
    # documented ValueError instead of an IndexError on ``predictions[0]``.
    if len(predictions) != 1:
        raise ValueError("Predictions should have one item")
    # sacrebleu reports BLEU on a 0-100 scale; rescale it by 1/100.
    return float(sacrebleu.sentence_bleu(hypothesis=predictions[0], references=golds).score / 100)


# Per-sample scores come from ``translation_eval_fn`` and are averaged with
# ``np.mean`` at the corpus level.
sentence_bleu = CorpusLevelMetric(
    metric="sentence_bleu",
    sample_level_fn=translation_eval_fn,
    category=MetricCategory.GENERATIVE,
    use_case=MetricUseCase.TRANSLATION,
    corpus_level_fn=np.mean,
    higher_is_better=True,
)
# Add the metric to the ``Metrics`` enum under the name the task config below refers to.
extend_enum(Metrics, 'sentence_bleu', sentence_bleu)

# This is how you create a simple task (like hellaswag) which has one single subset
# attached to it, and one evaluation possible.
translation_task = LightevalTaskConfig(
    name="he-en-trans-bleu",
    # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
    prompt_function="trans_prompt_fn",
    suite=["custom"],
    hf_repo=f"{OWNER}/tests",
    hf_subset="default",
    hf_avail_splits=["en2he", "he2en"],
    evaluation_splits=["en2he", "he2en"],
    metric=['sentence_bleu'],
    stop_sequence=['\n'],
    generation_size=256,
)