Spaces:

SUSTech
/

tlem

Running

App Files Community

facat committed on Nov 28, 2023

Commit

9827786

1 Parent(s): 6d6787f

refactor

Browse files

Files changed (3) hide show

tasks.py +11 -12
tlem.py +23 -26
utils.py +32 -159

tasks.py CHANGED Viewed

@@ -261,18 +261,17 @@ class Metrics:
         return responses, answers
     def MATH(responses: list[str], answers: list[str]):
-        extract_responses = []
-        for response in responses:
-            indices = [pos for pos, char in enumerate(response) if char == "$"]
-            if len(indices) <= 2:
-                ans = ""
-            else:
-                ans = response[indices[-2] + 1 : indices[-1]]
-            extract_responses.append(strip_string(ans))
-        extract_answers=[]
-        for answer in answers:
-            extract_answers.append(strip_string(get_answer(answer)))
-        return extract_responses, extract_answers
 class CMMLU:

         return responses, answers
     def MATH(responses: list[str], answers: list[str]):
+        # Score MATH-style outputs: reduce both the model responses and the
+        # reference solutions to answer strings with get_answer, then compare
+        # each pair with is_equiv. Returns one is_equiv result per pair.
+        # NOTE(review): sync_pipe / get_answer are presumably the helpers
+        # added to utils.py in this same commit — confirm tasks.py imports.
+        extract_responses = sync_pipe(get_answer)(responses)
+        extract_answers = sync_pipe(get_answer)(answers)
+        # is_equiv lives in an optional package, so it is imported lazily here;
+        # on failure the install hint is logged and the ImportError re-raised.
+        try:
+            from math_equivalence import is_equiv
+        except ImportError as e:
+            logging.warning(
+                "math_equivalence not installed, pip install git+https://github.com/hendrycks/math.git"
+            )
+            raise e
+        # zip yields (response, answer) tuples; sync_pipe unpacks tuple samples
+        # into positional arguments, so is_equiv receives both strings.
+        return sync_pipe(is_equiv)(zip(extract_responses, extract_answers))
 class CMMLU:

tlem.py CHANGED Viewed

@@ -1,12 +1,9 @@
-# %%
 try:
     from ipytorch import logging
 except Exception as e:
     import logging
 from typing import Any, Optional, Protocol, Iterable, Callable
-from numpy.lib import extract
 from tqdm.auto import tqdm
 from evaluate.evaluation_suite import EvaluationSuite
 import evaluate
@@ -14,7 +11,7 @@ import numpy as np
 import datasets
 import pandas as pd
 from .tasks import *
-from .utils import is_equiv
 class ReasoningMetric(evaluate.Metric):
@@ -46,27 +43,27 @@ class ReasoningMetric(evaluate.Metric):
             reference_urls=["http://path.to.reference.url/new_module"],
         )
-    def _compute(self, responses, references, verbose=False):
-        extract_responses, extract_references = getattr(Metrics, self.config_name)(
-            responses, references
-        )
-        df = pd.DataFrame(
-            {
-                "responses": responses,
-                "references": references,
-            }
-        )
-        df["extract_responses"] = extract_responses
-        df["extract_references"] = extract_references
-        # print(df)
-        results = {
-            "Accuracy": (df["extract_references"] == df["extract_responses"])
-            .astype(int)
-            .mean(),
-        }
-        logging.info(results)
-        if verbose:
-            results["df"] = df
         return results
@@ -139,7 +136,7 @@ class Suite(EvaluationSuite):
                 suite = Task(
                     dataset_name="hendrycks/competition_math",
                     split="test",
-                    prompt="This is a math problem, please think step by step and slove it: {input_column}",
                     metric_name=("sustech/tlem", "MATH"),
                     input_column="problem",
                     label_column="solution",

 try:
     from ipytorch import logging
 except Exception as e:
     import logging
 from typing import Any, Optional, Protocol, Iterable, Callable
 from tqdm.auto import tqdm
 from evaluate.evaluation_suite import EvaluationSuite
 import evaluate
 import datasets
 import pandas as pd
 from .tasks import *
+from .utils import *
 class ReasoningMetric(evaluate.Metric):
             reference_urls=["http://path.to.reference.url/new_module"],
         )
+    def _compute(self, responses, references):
+        # Look up the function on Metrics named by self.config_name, call it
+        # with the raw responses/references, and normalise whatever it returns
+        # into a results dict.
+        return_value = getattr(Metrics, self.config_name)(responses, references)
+        match return_value:
+            case tuple():
+                # A pair (extracted responses, extracted references): score
+                # each position by exact equality and report the mean under
+                # the config name.
+                extract_responses, extract_references = return_value
+                results = {
+                    self.config_name: np.mean(
+                        sync_pipe(lambda x, y: x == y)(
+                            zip(extract_responses, extract_references)
+                        )
+                    )
+                }
+            case dict():
+                # Already a results mapping; returned unchanged.
+                results = return_value
+            case list():
+                # A list of per-sample values (e.g. what Metrics.MATH returns
+                # in this commit): report their mean under the config name.
+                results = {self.config_name: np.mean(return_value)}
+            case _:
+                # Any other return type is not supported.
+                raise NotImplementedError
         return results
                 suite = Task(
                     dataset_name="hendrycks/competition_math",
                     split="test",
+                    prompt="This is a math problem, please think step by step and slove it: {input_column}, simplify your final answer as much as possible and surround them with $ in TeX form",
                     metric_name=("sustech/tlem", "MATH"),
                     input_column="problem",
                     label_column="solution",

utils.py CHANGED Viewed

@@ -2,6 +2,8 @@ import logging
 import re
 import numpy as np
 from typing import Any
 NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
 NUMERIC_IN_ZH = (
@@ -9,6 +11,28 @@ NUMERIC_IN_ZH = (
 )
 def return_blank_if_exception(func):
     def wrapper(*args, **kwargs):
         try:
@@ -155,166 +179,15 @@ def last_boxed_only_string(string):
     return retval
-def fix_sqrt(string):
-    if "\\sqrt" not in string:
-        return string
-    splits = string.split("\\sqrt")
-    new_string = splits[0]
-    for split in splits[1:]:
-        if split[0] != "{":
-            a = split[0]
-            new_substr = "\\sqrt{" + a + "}" + split[1:]
-        else:
-            new_substr = "\\sqrt" + split
-        new_string += new_substr
-    return new_string
-def remove_right_units(string):
-    # "\\text{ " only ever occurs (at least in the val set) when describing units
-    if "\\text{ " in string:
-        splits = string.split("\\text{ ")
-        # assert len(splits) == 2
-        return splits[0]
-    else:
-        return string
-def fix_fracs(string):
-    substrs = string.split("\\frac")
-    new_str = substrs[0]
-    if len(substrs) > 1:
-        substrs = substrs[1:]
-        for substr in substrs:
-            new_str += "\\frac"
-            if substr[0] == "{":
-                new_str += substr
-            else:
-                try:
-                    assert len(substr) >= 2
-                except AssertionError:
-                    return string
-                a = substr[0]
-                b = substr[1]
-                if b != "{":
-                    if len(substr) > 2:
-                        post_substr = substr[2:]
-                        new_str += "{" + a + "}{" + b + "}" + post_substr
-                    else:
-                        new_str += "{" + a + "}{" + b + "}"
-                else:
-                    if len(substr) > 2:
-                        post_substr = substr[2:]
-                        new_str += "{" + a + "}" + b + post_substr
-                    else:
-                        new_str += "{" + a + "}" + b
-    string = new_str
-    return string
-def fix_a_slash_b(string):
-    if len(string.split("/")) != 2:
-        return string
-    a = string.split("/")[0]
-    b = string.split("/")[1]
-    try:
-        a = int(a)
-        b = int(b)
-        assert string == "{}/{}".format(a, b)
-        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
-        return new_string
-    except Exception as e:
-        return string
-def strip_string(string):
-    # linebreaks
-    string = string.replace("\n", "")
-    # remove inverse spaces
-    string = string.replace("\\!", "")
-    # replace \\ with \
-    string = string.replace("\\\\", "\\")
-    # replace tfrac and dfrac with frac
-    string = string.replace("tfrac", "frac")
-    string = string.replace("dfrac", "frac")
-    # remove \left and \right
-    string = string.replace("\\left", "")
-    string = string.replace("\\right", "")
-    # Remove circ (degrees)
-    string = string.replace("^{\\circ}", "")
-    string = string.replace("^\\circ", "")
-    # remove dollar signs
-    string = string.replace("\\$", "")
-    # remove units (on the right)
-    string = remove_right_units(string)
-    # remove percentage
-    string = string.replace("\\%", "")
-    string = string.replace("\%", "")  # noqa: W605
-    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
-    string = string.replace(" .", " 0.")
-    string = string.replace("{.", "{0.")
-    # if empty, return empty string
-    if len(string) == 0:
-        return string
-    if string[0] == ".":
-        string = "0" + string
-    # to consider: get rid of e.g. "k = " or "q = " at beginning
-    if len(string.split("=")) == 2:
-        if len(string.split("=")[0]) <= 2:
-            string = string.split("=")[1]
-    # fix sqrt3 --> sqrt{3}
-    string = fix_sqrt(string)
-    # remove spaces
-    string = string.replace(" ", "")
-    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
-    string = fix_fracs(string)
-    # manually change 0.5 --> \frac{1}{2}
-    if string == "0.5":
-        string = "\\frac{1}{2}"
-    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
-    # string = fix_a_slash_b(string)
-    string = string.split("=")[-1]
-    while string.startswith("\\boxed{") and string.endswith("}"):
-        string = string[7:-1]
-        string = string.split("=")[-1]
-    return string
 def get_answer(string):
-    answer = remove_boxed(last_boxed_only_string(string))
-    return answer
-def is_equiv(str1, str2, verbose=False):
-    if str1 is None and str2 is None:
-        print("WARNING: Both None")
-        return False
-    if str1 is None or str2 is None:
-        return False
-    try:
-        ss1 = strip_string(str1)
-        ss2 = strip_string(str2)
-        if verbose:
-            print(ss1, ss2)
-        return ss1 == ss2
-    except Exception:
-        return str1 == str2
 def first_option_postprocess(text: str, options: str) -> str:

 import re
 import numpy as np
 from typing import Any
+from tqdm.auto import tqdm
+import asyncio
 NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
 NUMERIC_IN_ZH = (
 )
+def async_pipe(func):
+    # Wrap `func` (presumably a coroutine function, since its results are
+    # awaited) into a blocking callable that runs it over a batch of samples.
+    # NOTE(review): the inner coroutine is named `sync_function` although it
+    # is async; only the returned `sync_func` is synchronous.
+    async def sync_function(samples):
+        # A non-list argument is treated as a single sample.
+        if not isinstance(samples, list):
+            samples = [samples]
+        # One func(sample) awaitable per sample, awaited together through
+        # tqdm.gather.
+        return await tqdm.gather(*[func(sample) for sample in samples], leave=False)
+    def sync_func(samples):
+        # asyncio.run creates a fresh event loop, so this cannot be called
+        # from code that is already running inside an event loop.
+        return asyncio.run(sync_function(samples))
+    return sync_func
+def sync_pipe(func, progress=False):
+    # Build a callable that applies `func` to every sample of an iterable and
+    # returns the results as a list. The tqdm bar is disabled unless
+    # `progress` is true.
+    def sync_func(samples):
+        return [
+            # Tuple samples are unpacked into positional arguments (so zipped
+            # inputs can feed multi-argument functions); any other sample is
+            # passed as a single argument.
+            func(*sample) if isinstance(sample, tuple) else func(sample)
+            for sample in tqdm(samples, disable=not progress, leave=False)
+        ]
+    return sync_func
 def return_blank_if_exception(func):
     def wrapper(*args, **kwargs):
         try:
     return retval
 def get_answer(string):
+    # Extract an answer from `string`, trying in order:
+    #   1. whatever last_boxed_only_string finds (presumably the last
+    #      \boxed{...} expression — confirm against that helper), unwrapped
+    #      with remove_boxed;
+    #   2. the text between the last two "$" characters, keeping only the
+    #      part after the last "=";
+    #   3. extract_numeric(string) when fewer than two "$" are present.
+    if boxed := last_boxed_only_string(string):
+        return remove_boxed(boxed)
+    else:
+        indices = [pos for pos, char in enumerate(string) if char == "$"]
+        # Fewer than two "$" means there is no delimited span to slice.
+        if len(indices) < 2:
+            return extract_numeric(string)
+        string = string[indices[-2] + 1 : indices[-1]]
+        # No whitespace stripping happens here: "x = 5" yields " 5".
+        return string.split("=")[-1]
 def first_option_postprocess(text: str, options: str) -> str: