# %%
try:
    from ipytorch import logging
except Exception as e:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable

from tqdm.auto import tqdm
from evaluate.evaluation_suite import EvaluationSuite

# %%
# %cd ../tlem
# %load_ext ipytorch
# %ls

from utils import (
    NUMERIC_IN_ZH,
    extract_choice_ans,
    extract_numeric,
    get_answer,
    is_equiv,
)

from dataclasses import dataclass, field
from datasets import load_dataset, Dataset
from functools import cached_property

# A text-generation pipeline is any callable mapping prompts to generated strings.
TextGenerationPipeline = Callable[[Iterable[str]], list[str]]

from evaluate import load


def fake_pipeline(prompts: Iterable[str]) -> list[str]:
    # Echo pipeline for dry runs: every "generation" is the prompt itself.
    return [prompt for prompt in tqdm(prompts)]


@dataclass
class Task:
    dataset_name: str | tuple[str, str] = ("gsm8k", "main")
    split: str = "test"
    # metrics: list[str] = field(default_factory=list)
    metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
    input_column: str = "question"
    label_column: str = "answer"
    prompt: Optional[Callable | str] = None

    @cached_property
    def name(self):
        return (
            self.dataset_name
            if isinstance(self.dataset_name, str)
            else self.dataset_name[0]
        ) + f"-{self.split}"

    @cached_property
    def samples(self):
        return self.dataset[self.input_column]

    @cached_property
    def dataset(self):
        # Unpack (name, config) tuples; wrap plain strings so they are passed as a
        # single dataset name instead of being unpacked character by character.
        ds = load_dataset(
            *(
                self.dataset_name
                if isinstance(self.dataset_name, tuple)
                else (self.dataset_name,)
            ),
            split=self.split,
        )
        if self.prompt is not None:
            ds = ds.map(
                lambda example: {
                    self.input_column: self.prompt.format(
                        input_column=example[self.input_column]
                    )
                }
                if isinstance(self.prompt, str)
                else self.prompt(example),
            )

        return ds

    @cached_property
    def metric(self):
        metric = (
            load(self.metric_name)
            if isinstance(self.metric_name, str)
            else load(*self.metric_name)
        )
        return metric

    def run(self, pipeline: TextGenerationPipeline = fake_pipeline):
        outputs = pipeline(self.samples)
        return self.metric.compute(
            responses=outputs, references=self.dataset[self.label_column]
        )


class Metrics:
    """Per-benchmark scorers, dispatched by ReasoningMetric via its config_name."""

    @staticmethod
    def gsm8k(responses: list[str], answers: list[str | int]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response)
            gold = extract_numeric(answer) if isinstance(answer, str) else str(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def MATH(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            # The predicted answer is read from the last "$...$" span of the response.
            indices = [pos for pos, char in enumerate(response) if char == "$"]
            if len(indices) <= 2:
                scores.append(0)
                continue
            result = response[indices[-2] + 1 : indices[-1]]
            gold = get_answer(answer)
            scores.append(1.0 * is_equiv(result, gold))
        return scores

    @staticmethod
    def math23k(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer, pattern=NUMERIC_IN_ZH)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def gsm8k_zh(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def svamp(responses: list[str], answers: list[float]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = answer
            scores.append(1.0 * (float(pred) == gold))
        return scores

    @staticmethod
    def mmlu(responses, answers):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_choice_ans(response)
            gold = answer.lower()
            scores.append(1.0 * (pred == gold))
        return scores
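# %%
# Hypothetical usage sketch (not part of the original module): a Task wired to
# the gsm8k test split can be exercised end to end with `fake_pipeline`, which
# simply echoes each prompt; a real pipeline should return generated answers.
#
# gsm8k_task = Task(dataset_name=("gsm8k", "main"), split="test")
# result = gsm8k_task.run(fake_pipeline)  # -> {"accuracy": ..., "scores": [...]}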
import evaluate
import numpy as np
import datasets


# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# TODO: Add description of the module here
_DESCRIPTION = """\
A simple measurement that returns the number of elements in dataset.
"""

# TODO: Add description of the arguments of the module here
_KWARGS_DESCRIPTION = """
Calculates number of elements in dataset
Args:
    data: list of elements.
Returns:
    element_count: number of elements in dataset,
Examples:
    >>> measure = evaluate.load("lvwerra/element_count")
    >>> measure.compute(["a", "b", "c"])
    {"element_count": 3}
"""

# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ReasoningMetric(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value("string"),
            }
        )

        if self.config_name == "svamp":
            features = datasets.Features(
                {
                    "responses": datasets.Value("string"),
                    "references": datasets.Value("float"),
                }
            )

        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.EvaluationModuleInfo(
            # This is the description that will appear on the modules page.
            # module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=features,
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        # Dispatch to the scorer matching this config (e.g. "gsm8k", "svamp").
        scores = getattr(Metrics, self.config_name)(responses, references)
        acc = np.asarray(scores).mean()
        results = {
            "accuracy": acc,
            "scores": scores,
        }

        if verbose:
            results["references"] = references
            results["answers"] = responses
            # results["scores"] = scores

        return results


class Suite(EvaluationSuite):
    def run(
        self, model_or_pipeline: Any, prompt: str = "{instruction}"
    ) -> dict[str, float]:
        self.assert_suite_nonempty()
        results_all = {}
        for task in tqdm(self.suite, desc="Running tasks"):
            task_name = task.name
            results = task.run(model_or_pipeline)
            results_all[task_name] = results
        return results_all

    def __init__(self, name):
        super().__init__(name)

        self.suite = [
            Task(
                dataset_name=("gsm8k", "main"),
                metric_name=("sustech/tlem", "gsm8k"),
                input_column="question",
                label_column="answer",
            )
            # TASK_REGISTRY["gsm8k"],
            # TASK_REGISTRY["competition_math"],
        ]


# %%
if __name__ == "__main__":
    # metric = load("sustech/tlem", "gsm8k")
    # output = metric.compute(responses=["answer is 2", "1+2"], references=["2", "3"])
    # logging.info(output)
    suite = EvaluationSuite.load("sustech/tlem")
    suite.run(fake_pipeline)

# %%
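# %%
# Sketch (not part of the original file): adapting a Hugging Face `transformers`
# text-generation pipeline to the `TextGenerationPipeline` protocol expected by
# `Task.run`, i.e. a callable mapping an iterable of prompt strings to a list of
# generated strings. The model name "gpt2" and the generation settings below are
# placeholders.
def hf_text_pipeline(prompts: Iterable[str], model_name: str = "gpt2") -> list[str]:
    from transformers import pipeline  # local import keeps the dependency optional

    generator = pipeline("text-generation", model=model_name)
    # The pipeline returns a list of candidate dicts per prompt; keep the first
    # candidate's text so the metric receives exactly one response per sample.
    outputs = generator(list(prompts), max_new_tokens=256, return_full_text=False)
    return [candidates[0]["generated_text"] for candidates in outputs]


# suite = EvaluationSuite.load("sustech/tlem")
# suite.run(hf_text_pipeline)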