# %%

# Prefer the ipytorch logger when it is available; otherwise fall back to the
# standard-library logging module.
try:
    from ipytorch import logging
except ImportError:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable
from tqdm.auto import tqdm
from evaluate.evaluation_suite import EvaluationSuite
import evaluate
import numpy as np
import datasets
from tasks import Task, Metrics, fake_pipeline
from utils import is_equiv

# %%

# %cd ../tlem

# %load_ext ipytorch
# %ls


# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
Accuracy-style metrics for reasoning benchmarks: model responses are scored
against reference answers with a task-specific checker (e.g. GSM8K, SVAMP).
"""


_KWARGS_DESCRIPTION = """
Scores model responses against reference answers for a reasoning task.
Args:
    responses: list of model-generated answer strings.
    references: list of reference answers (strings, or floats for the "svamp" config).
Returns:
    accuracy: mean of the per-example scores,
    scores: list of per-example scores.
Examples:
    >>> metric = evaluate.load("sustech/tlem", "gsm8k")
    >>> metric.compute(responses=["The answer is 3."], references=["3"])
    {'accuracy': ..., 'scores': [...]}
"""

# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ReasoningMetric(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value("string"),
            }
        )

        if self.config_name == "svamp":
            features = datasets.Features(
                {
                    "responses": datasets.Value("string"),
                    "references": datasets.Value("float"),
                }
            )

        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.EvaluationModuleInfo(
            # This is the description that will appear on the modules page.
            # module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=features,
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        # Dispatch to the task-specific scorer (e.g. Metrics.gsm8k) selected by
        # this metric's config name, then report the mean score as accuracy.
        scores = getattr(Metrics, self.config_name)(responses, references)
        results = {
            "accuracy": np.asarray(scores).mean(),
            "scores": scores,
        }

        if verbose:
            results["references"] = references
            results["answers"] = responses

        return results

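# %%
# A minimal usage sketch for ReasoningMetric (an assumption, not part of the
# original file: it presumes this module is published on the Hub under the id
# "sustech/tlem", the same id the Suite below points its tasks at). Guarded so
# that importing this module never triggers a Hub download.
if __name__ == "__main__":
    gsm8k_metric = evaluate.load("sustech/tlem", "gsm8k")
    demo = gsm8k_metric.compute(
        responses=["The answer is 42."],
        references=["42"],
    )
    print(demo)  # {"accuracy": ..., "scores": [...]}
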

class Suite(EvaluationSuite):
    def run(
        self, model_or_pipeline: Any, prompt: str = "{instruction}"
    ) -> dict[str, Any]:
        self.assert_suite_nonempty()

        results_all = {}
        for task in tqdm(self.suite, desc="Running tasks"):
            task_name = task.name
            results = task.run(model_or_pipeline)
            results_all[task_name] = results
        return results_all

    def __init__(self, name):
        super().__init__(name)

        self.suite = [
            Task(
                dataset_name=("gsm8k", "main"),
                metric_name=("sustech/tlem", "gsm8k"),
                input_column="question",
                label_column="answer",
            )
            # TASK_REGISTRY["gsm8k"],
            # TASK_REGISTRY["competition_math"],
        ]


# %%
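# A sketch of running the whole suite end to end. Hypothetical usage: the suite
# name "tlem" is just a label, and `fake_pipeline` (imported from tasks above)
# stands in for a real model or text-generation pipeline so the example has no
# model dependency.
if __name__ == "__main__":
    suite = Suite("tlem")
    report = suite.run(fake_pipeline)
    print(report)  # e.g. {"gsm8k": {"accuracy": ..., "scores": [...]}}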