Spaces:

SUSTech
/

tlem

Running

App Files Files Community

facat committed on Sep 6, 2023

Commit

8af54b8

•

1 Parent(s): 507319c

upd

Browse files

Files changed (4) hide show

.gitignore +2 -0
pyproject.toml +14 -0
tlem.py +225 -0
utils.py +257 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+ tlem.ju.py

pyproject.toml ADDED Viewed

	@@ -0,0 +1,14 @@

+[tool.poetry]
+name = "tlem"
+version = "0.1.0"
+description = ""
+authors = ["fecet <xiezej@gmail.com>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "3.10"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"

tlem.py ADDED Viewed

	@@ -0,0 +1,225 @@

+# %%
+try:
+    from ipytorch import logging
+except Exception as e:
+    import logging
+from typing import Any, Optional, Protocol, Iterable, Callable
+# %%
+# %cd ../tlem
+# %load_ext ipytorch
+# %ls
+from utils import (
+    NUMERIC_IN_ZH,
+    extract_choice_ans,
+    extract_numeric,
+    get_answer,
+    is_equiv,
+)
+from dataclasses import dataclass, field
+from datasets import load_dataset, Dataset
+from functools import cached_property
+TextGenerationPipeline = Callable[[Iterable[str]], list[str]]
+from evaluate import EvaluationModule, Evaluator, evaluator, load
+@dataclass
+class Task:
+    dataset_name: str = "gsm8k"
+    dataset_params: dict = field(default_factory=dict)
+    # metrics: list[str] = field(default_factory=list)
+    metric_name: str | tuple[str, str] = "gsm8k"
+    input_column: str = "question"
+    label_column: str
+    prompt: Optional[Callable | str] = None
+    @cached_property
+    def samples(self):
+        return self.dataset[self.input_column]
+    @cached_property
+    def dataset(self):
+        ds = load_dataset(self.dataset_name, **self.dataset_params)
+        if self.prompt is not None:
+            ds = ds.map(
+                lambda example: {
+                    self.input_column: self.prompt.format(
+                        input_column=example[self.input_column]
+                    )
+                }
+                if isinstance(self.prompt, str)
+                else self.prompt(example),
+            )
+        return ds
+    @cached_property
+    def metric(self):
+        metric = (
+            load(self.metric_name)
+            if isinstance(self.metric_name, str)
+            else load(*self.metric_name)
+        )
+        return metric
+    def run(self, pipeline: TextGenerationPipeline):
+        outputs = pipeline(self.samples)
+        return self.metric.compute(outputs, self.dataset[self.label_column])
class Metrics:
    """Namespace of per-benchmark scoring functions.

    Every function takes the model responses and the reference answers and
    returns one score per pair: 1.0 when they agree, 0.0 otherwise. The
    functions are static so they can be looked up by name on the class
    (``getattr(Metrics, name)``) or on an instance.
    """

    @staticmethod
    def gsm8k(responses: list[str], answers: list[str | int]):
        """Compare the last number in each response with the reference.

        String references are parsed with ``extract_numeric``; any other
        reference is converted with ``str``.
        """
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response)
            gold = extract_numeric(answer) if isinstance(answer, str) else str(answer)
            scores.append(float(pred == gold))
        return scores

    @staticmethod
    def MATH(responses: list[str], answers: list[str]):
        """Compare the last ``$...$`` span of each response with the reference.

        A response needs at least two ``$`` characters to delimit a span;
        anything with fewer scores 0.
        """
        scores = []
        for response, answer in zip(responses, answers):
            indices = [pos for pos, char in enumerate(response) if char == "$"]
            # Two dollar signs are enough to delimit one span, so only reject
            # responses that have fewer than two.
            if len(indices) < 2:
                scores.append(0.0)
                continue
            result = response[indices[-2] + 1 : indices[-1]]
            gold = get_answer(answer)
            scores.append(float(is_equiv(result, gold)))
        return scores

    @staticmethod
    def math23k(responses: list[str], answers: list[str]):
        """Compare the last number of response and reference (NUMERIC_IN_ZH)."""
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer, pattern=NUMERIC_IN_ZH)
            scores.append(float(pred == gold))
        return scores

    @staticmethod
    def gsm8k_zh(responses: list[str], answers: list[str]):
        """Like ``gsm8k`` but parses the response with NUMERIC_IN_ZH."""
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer)
            scores.append(float(pred == gold))
        return scores

    @staticmethod
    def svamp(responses: list[str], answers: list[float]):
        """Compare the last number of each response with a numeric reference."""
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            try:
                correct = float(pred) == answer
            except ValueError:
                # extract_numeric hands back the raw response when it finds
                # no number; that is a wrong answer, not a reason to abort
                # the whole evaluation.
                correct = False
            scores.append(float(correct))
        return scores

    @staticmethod
    def mmlu(responses, answers):
        """Compare the extracted choice letter with the lower-cased reference."""
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_choice_ans(response)
            gold = answer.lower()
            scores.append(float(pred == gold))
        return scores
+import evaluate
+import numpy as np
+import datasets
# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""
# Module description shown to users of the metric.
_DESCRIPTION = """\
Accuracy metrics for reasoning benchmarks: the final answer is extracted from
each model response and compared with the reference answer.
"""
# Description of the arguments and return values of the metric.
_KWARGS_DESCRIPTION = """
Scores model responses against reference answers.
The configuration name selects the scoring rule; it must be the name of a
function on the Metrics class (gsm8k, MATH, math23k, gsm8k_zh, svamp, mmlu).
Args:
    responses: list of generated texts, one per example.
    references: list of reference answers, one per example
        (floats for the svamp configuration, strings otherwise).
    verbose: if True, also return the inputs alongside the scores.
Returns:
    accuracy: mean of the per-example scores.
    scores: list of per-example scores (1.0 for a correct answer, 0.0 otherwise).
    references, answers: the given references and responses (only when verbose is True).
"""
# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ReasoningMetric(evaluate.Metric):
    """Metric that delegates scoring to the Metrics function named by the config."""

    def _info(self):
        """Describe the module; references are floats for "svamp", strings otherwise."""
        reference_dtype = "float" if self.config_name == "svamp" else "string"
        features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value(reference_dtype),
            }
        )
        # TODO: replace the placeholder homepage / codebase / reference URLs.
        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Format of each response / reference pair.
            features=features,
            homepage="http://module.homepage",
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        """Score the responses and return the mean accuracy plus per-example scores.

        With ``verbose`` the given references and responses are echoed back
        under the keys "references" and "answers".
        """
        # The config name doubles as the name of the scoring function.
        scorer = getattr(Metrics, self.config_name)
        scores = scorer(responses, references)
        results = {
            "accuracy": np.asarray(scores).mean(),
            "scores": scores,
        }
        if verbose:
            results.update(references=references, answers=responses)
        return results

utils.py ADDED Viewed

	@@ -0,0 +1,257 @@

+import logging
+import re
# Number preceded by one delimiter character (whitespace or one of the listed
# punctuation marks) and followed by the end of the string, a delimiter, or
# ", ". Group 1 is the number: "0", comma-grouped thousands or plain digits,
# with an optional decimal part and an optional trailing "%".
# Note that "+-/" inside the character classes is a range (+ , - . /), so a
# leading minus sign is consumed as a delimiter and is not part of group 1.
NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
# Looser variant: the number only has to be bounded by a non-digit (or the
# string edge) on each side. Group 1 has the same shape as in NUMERIC_IN_EN.
NUMERIC_IN_ZH = (
    r"(?:\D|^)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?=\D|$)"
)


def extract_choice_ans(text):
    """Return the chosen option letter (lower case) found in ``text``.

    Stand-alone letters A-D (either case) and parenthesised letters such as
    "(B)" are collected. The last parenthesised letter wins when there is
    one; otherwise the last stand-alone letter is used. Returns "_" when no
    candidate is found.
    """
    bare = re.findall(r"\b[ABCDabcd]\b", text)
    bracketed = re.findall(r"\([ABCDabcd]\)", text)
    candidates = bare + bracketed
    if not candidates:
        return "_"
    last = candidates[-1]
    # A parenthesised match looks like "(B)"; the letter is its middle char.
    letter = last if len(last) == 1 else last[1]
    return letter.lower()


def _standardize_numeric(token):
    """Normalise one matched number: drop commas/trailing zeros, resolve "%"."""
    value = token.replace(",", "")
    if "." in value:
        value = value.rstrip("0")
        if value.endswith("."):
            value = value[:-1]
    if value.startswith("."):
        value = "0" + value
    if value.endswith("%"):
        # float() instead of eval(): the text comes from model output, and
        # eval() also rejects digit strings with leading zeros such as "05".
        value = str(float(value[:-1]) / 100)
    return value


def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str:
    """Return the last number found in ``string`` in a normalised form.

    Args:
        string: text to search.
        pattern: regular expression whose single group captures a number.

    Returns:
        The last match with thousands separators and trailing decimal zeros
        removed and percentages converted to a fraction, or ``string``
        itself when nothing matches.
    """
    candidates = [
        match
        for match in re.findall(pattern, string)
        if match.strip() and match != "%"
    ]
    if not candidates:
        logging.debug("No numeric value found in string: %s", string)
        return string
    return _standardize_numeric(candidates[-1].strip())
def remove_boxed(s):
    """Strip the ``\\boxed`` wrapper from ``s`` and return its content.

    Two forms are accepted: ``\\boxed <content>`` (space separated) and
    ``\\boxed{<content>}``.

    Raises:
        AssertionError: if ``s`` does not have one of the two forms. The
            error is raised explicitly so the check also runs under
            ``python -O``, where ``assert`` statements are removed.
    """
    spaced = "\\boxed "
    if spaced in s:
        if not s.startswith(spaced):
            raise AssertionError("string does not start with '\\boxed '")
        return s[len(spaced) :]
    braced = "\\boxed{"
    if not s.startswith(braced):
        raise AssertionError("string does not start with '\\boxed{'")
    if not s.endswith("}"):
        raise AssertionError("string does not end with '}'")
    return s[len(braced) : -1]
def last_boxed_only_string(string):
    """Return the last ``\\boxed`` (or ``\\fbox``) expression in ``string``.

    The space-separated form ``\\boxed <content>`` takes precedence and is
    cut at the next "$". Otherwise the braces following the last ``\\boxed``
    (falling back to the last ``\\fbox``) are matched and the span up to the
    closing brace is returned. Returns None when there is no such expression
    or its braces never close.
    """
    start = string.rfind("\\boxed")
    if "\\boxed " in string:
        tail = string.split("\\boxed ")[-1]
        return "\\boxed " + tail.split("$")[0]
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None
    # Walk forward and track brace depth until the opening brace is closed.
    depth = 0
    for pos in range(start, len(string)):
        char = string[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return string[start : pos + 1]
    return None
def fix_sqrt(string):
    """Wrap a bare single-character ``\\sqrt`` argument in braces.

    ``\\sqrt3`` becomes ``\\sqrt{3}``; occurrences that already use braces,
    and a ``\\sqrt`` with nothing after it, are left unchanged.
    """
    if "\\sqrt" not in string:
        return string
    head, *tails = string.split("\\sqrt")
    pieces = [head]
    for tail in tails:
        # An empty tail means the string ends with "\sqrt" (or two follow
        # each other); there is no argument to wrap in that case.
        if tail and not tail.startswith("{"):
            pieces.append("\\sqrt{" + tail[0] + "}" + tail[1:])
        else:
            pieces.append("\\sqrt" + tail)
    return "".join(pieces)
def remove_right_units(string):
    """Drop a trailing ``\\text{ ...}`` unit annotation from ``string``.

    Everything from the marker ``\\text{ `` (note the space) onwards is
    removed. The marker is expected to occur at most once.
    """
    # "\\text{ " only ever occurs (at least in the val set) when describing units
    marker = "\\text{ "
    if marker not in string:
        return string
    pieces = string.split(marker)
    assert len(pieces) == 2
    return pieces[0]
def fix_fracs(string):
    """Add the braces that shorthand ``\\frac`` forms leave out.

    ``\\frac12`` becomes ``\\frac{1}{2}`` and ``\\frac1{72}`` becomes
    ``\\frac{1}{72}``; fractions that already start with a brace are kept.
    If a ``\\frac`` is followed by fewer than two characters (including
    none at all) the input is returned unchanged.
    """
    head, *tails = string.split("\\frac")
    pieces = [head]
    for tail in tails:
        pieces.append("\\frac")
        if tail.startswith("{"):
            pieces.append(tail)
            continue
        # Both a numerator and a denominator character are needed; this also
        # covers a string that ends right after "\frac".
        if len(tail) < 2:
            return string
        numerator, denominator, rest = tail[0], tail[1], tail[2:]
        if denominator == "{":
            # Only the numerator lacks braces, e.g. "\frac1{72}".
            pieces.append("{" + numerator + "}" + denominator + rest)
        else:
            pieces.append("{" + numerator + "}{" + denominator + "}" + rest)
    return "".join(pieces)
def fix_a_slash_b(string):
    """Rewrite a plain integer fraction ``a/b`` as ``\\frac{a}{b}``.

    The input is returned unchanged unless it consists of exactly two
    integers separated by one "/" and is already in canonical form (no
    surrounding spaces, leading zeros or explicit plus sign).
    """
    parts = string.split("/")
    if len(parts) != 2:
        return string
    try:
        numerator, denominator = int(parts[0]), int(parts[1])
    except ValueError:
        return string
    # int() tolerates spaces, leading zeros and "+"; only accept input that
    # round-trips exactly. An explicit comparison keeps this check active
    # under ``python -O``, unlike an assert.
    if string != "{}/{}".format(numerator, denominator):
        return string
    return "\\frac{" + str(numerator) + "}{" + str(denominator) + "}"
def strip_string(string):
    """Normalise a LaTeX answer string so equivalent answers compare equal.

    The steps run in a fixed order: remove formatting-only LaTeX (line
    breaks, ``\\!``, ``\\left``/``\\right``, degree marks, escaped dollar and
    percent signs, trailing units), add a leading zero to bare decimals,
    drop a short ``x =`` prefix, then fix ``\\sqrt`` and ``\\frac``
    shorthand and remove spaces. An input that becomes empty is returned as
    the empty string.
    """
    # Order matters: each replacement sees the result of the previous one.
    early_replacements = (
        ("\n", ""),  # linebreaks
        ("\\!", ""),  # inverse spaces
        ("\\\\", "\\"),  # replace \\ with \
        ("tfrac", "frac"),  # tfrac and dfrac become frac
        ("dfrac", "frac"),
        ("\\left", ""),  # \left and \right
        ("\\right", ""),
        ("^{\\circ}", ""),  # circ (degrees)
        ("^\\circ", ""),
        ("\\$", ""),  # dollar signs
    )
    for old, new in early_replacements:
        string = string.replace(old, new)
    # remove units (on the right)
    string = remove_right_units(string)
    # Remove percentage signs. The original repeated this call with the
    # invalid escape "\%", which denotes the same two characters.
    string = string.replace("\\%", "")
    # " 0." equivalent to " ." and "{0." equivalent to "{."
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if not string:
        return string
    if string[0] == ".":
        string = "0" + string
    # get rid of e.g. "k = " or "q = " at the beginning
    sides = string.split("=")
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]
    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)
    # remove spaces
    string = string.replace(" ", "")
    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works
    # with \frac1{72} (but not \frac{72}1).
    string = fix_fracs(string)
    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"
    # NOTE: X/Y is changed to \frac{X}{Y} in the dataset; fix_a_slash_b could
    # be applied here to do the same for model output, but is not.
    return string
def get_answer(string):
    """Return the content of the last boxed expression in ``string``.

    Falls back to returning ``string`` unchanged when extracting or
    unwrapping the boxed expression raises.
    """
    try:
        return remove_boxed(last_boxed_only_string(string))
    except Exception:
        # Best effort: no usable boxed expression, keep the raw text.
        return string
def is_equiv(str1, str2, verbose=False):
    """Tell whether two answer strings are equal after normalisation.

    Returns False when either argument is None (printing a warning when
    both are). If normalising raises, the raw strings are compared instead.
    With ``verbose`` the two normalised strings are printed.
    """
    if str1 is None or str2 is None:
        if str1 is None and str2 is None:
            print("WARNING: Both None")
        return False
    try:
        normalized = (strip_string(str1), strip_string(str2))
        if verbose:
            print(*normalized)
        return normalized[0] == normalized[1]
    except Exception:
        return str1 == str2
if __name__ == "__main__":
    # Small manual check of the numeric extraction.
    print(extract_numeric("the answer is -1.5"))