Files changed (7) hide show
  1. .gitignore +1 -5
  2. README.md +4 -9
  3. index.html +0 -0
  4. pyproject.toml +0 -14
  5. tasks.py +164 -188
  6. tlem.py +53 -142
  7. utils.py +278 -112
.gitignore CHANGED
@@ -1,6 +1,2 @@
1
  __pycache__
2
- *.ju.py
3
- tests
4
-
5
- README_files
6
- .ipynb_checkpoints
 
1
  __pycache__
2
+ tlem.ju.py
 
 
 
 
README.md CHANGED
@@ -1,10 +1,5 @@
1
- ---
2
- title: TLEM
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: static
7
- pinned: false
8
- ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+
3
+ # Transparent LLMs Evaluation Metrics
4
+
5
+ > LLMs belong to *tout le monde*
index.html CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml DELETED
@@ -1,14 +0,0 @@
1
- [tool.poetry]
2
- name = "tlem"
3
- version = "0.1.0"
4
- description = ""
5
- authors = ["fecet <xiezej@gmail.com>"]
6
- readme = "README.md"
7
-
8
- [tool.poetry.dependencies]
9
- python = "^3.10"
10
-
11
-
12
- [build-system]
13
- requires = ["poetry-core"]
14
- build-backend = "poetry.core.masonry.api"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tasks.py CHANGED
@@ -14,7 +14,6 @@ from .utils import *
14
  from evaluate import load
15
  from collections import defaultdict
16
  import sys
17
- from pathlib import Path
18
 
19
 
20
  # if sys.version_info >= (3, 9):
@@ -59,15 +58,13 @@ class Task:
59
  dataset_name: str | tuple[str, str] = ("gsm8k", "main")
60
  split: str = "test"
61
  # metrics: list[str] = field(default_factory=list)
62
- metric_name: str | tuple[str, str] = ("sustech/tlem", "mmlu")
63
  input_column: str = "question"
64
  label_column: str = ""
65
- output_column: str = "generated_text"
66
  prompt: Optional[Callable | str] = None
67
  few_shot: int = 0
68
  few_shot_from: Optional[str] = None
69
  # results: dict[str, Any] = field(default_factory=dict)
70
- # outputs: Optional[list] = field(default_factory=list)
71
 
72
  def __post_init__(self):
73
  names = (
@@ -75,7 +72,7 @@ class Task:
75
  if isinstance(self.dataset_name, str)
76
  else list(self.dataset_name)
77
  )
78
- names[0] = Path(names[0]).name
79
 
80
  self.name = "-".join(names) + f"-{self.split}"
81
  if isinstance(self.prompt, str):
@@ -87,21 +84,10 @@ class Task:
87
  }
88
  self.label_column = self.label_column or self.input_column
89
 
90
- def __eq__(self, __value: object) -> bool:
91
- return self.name == __value.name
92
-
93
  @cached_property
94
  def samples(self):
95
  return self.dataset[self.input_column]
96
 
97
- @cached_property
98
- def labels(self):
99
- return self.dataset[self.label_column]
100
-
101
- @cached_property
102
- def outputs(self):
103
- return self.dataset[self.output_column]
104
-
105
  @cached_property
106
  def dataset(self):
107
  ds = (
@@ -132,7 +118,6 @@ class Task:
132
  shots = shots.map(
133
  lambda example: {
134
  self.input_column: example[self.input_column]
135
- + "\n"
136
  + example[self.label_column],
137
  }
138
  )[self.input_column]
@@ -155,37 +140,33 @@ class Task:
155
  if isinstance(self.metric_name, str)
156
  else load(*self.metric_name)
157
  )
158
- return metric._compute
159
-
160
- @property
161
- def result(self) -> dict:
162
- assert self.outputs, "Please run the task first."
163
- results = self.metric(self.outputs, self.labels)
164
- # logging.info(f"{self.name}:{results}")
165
- return results
166
 
 
167
  def run(
168
  self,
169
  pipeline,
170
  ):
171
- if self.output_column not in self.dataset.column_names:
172
- self.dataset = self.dataset.add_column(
173
- self.output_column, pipeline(self.samples)
174
- )
175
-
176
- return self.result
177
-
178
- async def arun(self, pipeline):
179
- self.dataset = self.dataset.add_column(
180
- self.output_column, await pipeline(self.samples)
181
- )
182
-
183
- return self.result
 
 
 
 
 
184
 
185
- def save(self, path):
186
- self.dataset.select_columns(
187
- [self.input_column, self.output_column, self.label_column]
188
- ).save_to_disk(path)
189
 
190
 
191
  def multichoice(responses: Any, references: list[str]):
@@ -209,8 +190,10 @@ def multichoice_zh(responses: Any, references: list[str]):
209
  class Metrics:
210
  cmmlu = multichoice_zh
211
  mmlu = multichoice
212
- truthful_qa_mc1 = multichoice
213
- ceval = multichoice_zh
 
 
214
 
215
  def winogrande(responses: list[str], answers: list[str | int]):
216
  responses = [first_option_postprocess(pred, options="AB") for pred in responses]
@@ -235,11 +218,18 @@ class Metrics:
235
  return responses, answers
236
 
237
  def drop(responses: list[str], answers: list[list]):
238
- scores = []
239
- for pred, ans in zip(responses, answers):
240
- score = np.mean([1 if a in pred else 0 for a in ans])
241
- scores.append(score)
242
- return {"em": np.mean(scores)}
 
 
 
 
 
 
 
243
 
244
  def bbh_mcq(responses: list[str], answers: list[str | int]):
245
  if len(responses) != len(answers):
@@ -270,25 +260,17 @@ class Metrics:
270
 
271
  return responses, answers
272
 
273
- def boolq(responses: list[str], answers: list[str | int]):
274
-
275
- responses = [first_capital_postprocess(response) for response in responses]
276
- answers = ["A" if answer else "B" for answer in answers]
277
-
278
- return responses, answers
279
-
280
  def MATH(responses: list[str], answers: list[str]):
281
- extract_responses = sync_pipe(get_answer)(responses)
282
- extract_answers = sync_pipe(get_answer)(answers)
283
- try:
284
- from math_equivalence import is_equiv
285
- except ImportError as e:
286
- logging.error(
287
- "math_equivalence not installed, pip install git+https://github.com/hendrycks/math.git"
288
- )
289
- raise e
290
-
291
- return sync_pipe(is_equiv)(zip(extract_responses, extract_answers))
292
 
293
 
294
  class CMMLU:
@@ -301,7 +283,7 @@ class CMMLU:
301
  for choice in list("ABCD"):
302
  prompt += f"\n{choice}. {example[choice]}"
303
 
304
- prompt += "\n答案:"
305
  return {"prompt": prompt}
306
 
307
  subcategories = {
@@ -398,6 +380,7 @@ class CMMLU:
398
  ],
399
  "Other": ["other"],
400
  "China specific": ["china specific"],
 
401
  }
402
 
403
  @classmethod
@@ -624,29 +607,73 @@ class DROP:
624
  input_column = "input"
625
  label_column = "answers"
626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
  @classmethod
628
  def prompt_drop(cls, example):
629
- prompt = f"Read the following passage and answer the question.\n\n{example['passage']}\n\nQuestion: {example['question']}"
 
 
630
 
631
- return {
632
- cls.input_column: prompt,
633
- cls.label_column: ",".join(example["answers_spans"]["spans"]),
634
- }
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
  @classmethod
637
  def suite(
638
  cls,
639
  ):
640
- return Task(
641
- "drop",
642
- metric_name=("sustech/tlem", "drop"),
643
- input_column=cls.input_column,
644
- label_column=cls.label_column,
645
- prompt=partial(cls.prompt_drop),
646
- few_shot=3,
647
- few_shot_from="train",
648
- split="validation",
649
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
650
 
651
 
652
  class HellaSwag:
@@ -717,17 +744,28 @@ class ARC:
717
 
718
  @classmethod
719
  def suite(cls):
720
- suite = [
721
- Task(
722
- ("ai2_arc", subset),
723
- metric_name=("sustech/tlem", "arc"),
724
- input_column=cls.input_column,
725
- label_column=cls.label_column,
726
- prompt=partial(cls.prompt_arc),
727
- few_shot=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  )
729
- for subset in cls.categories
730
- ]
731
 
732
  return suite
733
 
@@ -789,107 +827,45 @@ class BBH:
789
  def suite(
790
  cls,
791
  ):
792
- suite = []
793
- for cate in cls.bbh_multiple_choice_sets:
794
- suite.append(
795
- Task(
796
- ("lukaemon/bbh", cate),
797
- metric_name=("sustech/tlem", "bbh_mcq"),
798
- input_column=cls.input_column,
799
- label_column=cls.label_column,
800
- prompt=partial(cls.prompt_bbh, category=cate),
801
- few_shot=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
  )
803
- )
804
- for cate in cls.bbh_free_form_sets:
805
- suite.append(
806
- Task(
807
- ("lukaemon/bbh", cate),
808
- metric_name=("sustech/tlem", "bbh_freefrom"),
809
- input_column=cls.input_column,
810
- label_column=cls.label_column,
811
- prompt=partial(cls.prompt_bbh, category=cate),
812
- few_shot=0,
813
  )
814
- )
815
-
816
- return suite
817
-
818
-
819
- class BoolQ:
820
- input_column = "input"
821
- label_column = "answer"
822
-
823
- @classmethod
824
- def prompt_boolq(cls, example, chat=False):
825
-
826
- prompt = f"{example['passage']}\nQuestion: {example['question']}\nA. Yes\nB. No\nAnswer: "
827
-
828
- return {"input": prompt}
829
-
830
- @classmethod
831
- def suite(cls, chat: bool):
832
-
833
- suite = [
834
- Task(
835
- dataset_name="boolq",
836
- metric_name=("sustech/tlem", "boolq"),
837
- input_column=cls.input_column,
838
- label_column=cls.label_column,
839
- prompt=partial(cls.prompt_boolq, chat=chat),
840
- few_shot=0 if chat else 5,
841
- few_shot_from="train",
842
- split="validation",
843
- )
844
- ]
845
 
846
  return suite
847
-
848
- class TruthfulQAMC1:
849
- input_column = "input"
850
- label_column = "answer"
851
-
852
- @classmethod
853
- def prompt_truthful_qa(cls, example):
854
-
855
- target = example["mc1_targets"]
856
- choices = target["choices"]
857
- labels = target["labels"]
858
-
859
- prompt = f"The following is a multiple-choice question. Please choose the most suitable one as the answer to this question.\n\n"
860
- prompt += example["question"]
861
-
862
- answer = []
863
-
864
- for idx, choice, label in zip(list("ABCDEFGHIJ")[:len(choices)], choices, labels):
865
-
866
- prompt += f"\n{idx}. {choice}"
867
-
868
- if label == 1:
869
- answer = idx
870
-
871
- prompt += "\nAnswer: "
872
-
873
- return {
874
- "input": prompt,
875
- "answer": answer
876
- }
877
-
878
- @classmethod
879
- def suite(cls):
880
- suite = [
881
- Task(
882
- dataset_name=("truthful_qa", "multiple_choice"),
883
- metric_name=("sustech/tlem", "truthful_qa_mc1"),
884
- input_column=cls.input_column,
885
- label_column=cls.label_column,
886
- prompt=partial(cls.prompt_truthful_qa),
887
- few_shot=0,
888
- split="validation",
889
- )
890
- ]
891
 
892
- return suite
893
 
894
  class CEVAL:
895
  input_column = "input"
 
14
  from evaluate import load
15
  from collections import defaultdict
16
  import sys
 
17
 
18
 
19
  # if sys.version_info >= (3, 9):
 
58
  dataset_name: str | tuple[str, str] = ("gsm8k", "main")
59
  split: str = "test"
60
  # metrics: list[str] = field(default_factory=list)
61
+ metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
62
  input_column: str = "question"
63
  label_column: str = ""
 
64
  prompt: Optional[Callable | str] = None
65
  few_shot: int = 0
66
  few_shot_from: Optional[str] = None
67
  # results: dict[str, Any] = field(default_factory=dict)
 
68
 
69
  def __post_init__(self):
70
  names = (
 
72
  if isinstance(self.dataset_name, str)
73
  else list(self.dataset_name)
74
  )
75
+ names[0] = names[0].split("/")[-1]
76
 
77
  self.name = "-".join(names) + f"-{self.split}"
78
  if isinstance(self.prompt, str):
 
84
  }
85
  self.label_column = self.label_column or self.input_column
86
 
 
 
 
87
  @cached_property
88
  def samples(self):
89
  return self.dataset[self.input_column]
90
 
 
 
 
 
 
 
 
 
91
  @cached_property
92
  def dataset(self):
93
  ds = (
 
118
  shots = shots.map(
119
  lambda example: {
120
  self.input_column: example[self.input_column]
 
121
  + example[self.label_column],
122
  }
123
  )[self.input_column]
 
140
  if isinstance(self.metric_name, str)
141
  else load(*self.metric_name)
142
  )
143
+ return metric
 
 
 
 
 
 
 
144
 
145
+ # @cache
146
  def run(
147
  self,
148
  pipeline,
149
  ):
150
+ if (outputs := pipeline(self.samples)) is None:
151
+ logging.warning("pipeline returns None")
152
+ return
153
+ self.outputs = outputs
154
+ try:
155
+ try:
156
+ result = self.metric._compute(
157
+ responses=outputs, references=self.dataset[self.label_column]
158
+ )
159
+ except Exception as e:
160
+ result = self.metric.compute(
161
+ responses=outputs, references=self.dataset[self.label_column]
162
+ )
163
+ except Exception as e:
164
+ result = outputs
165
+ # if log:
166
+ # name = name or pipeline.__name__
167
+ # self.results[name] = result
168
 
169
+ return result
 
 
 
170
 
171
 
172
  def multichoice(responses: Any, references: list[str]):
 
190
  class Metrics:
191
  cmmlu = multichoice_zh
192
  mmlu = multichoice
193
+
194
+ def ceval(responses: list[str], answers: list[str | int]):
195
+ responses = [extract_choice_zh(pred) for pred in responses]
196
+ return responses, answers
197
 
198
  def winogrande(responses: list[str], answers: list[str | int]):
199
  responses = [first_option_postprocess(pred, options="AB") for pred in responses]
 
218
  return responses, answers
219
 
220
  def drop(responses: list[str], answers: list[list]):
221
+ if len(responses) != len(answers):
222
+ return {"error": "predictions and references have different " "length"}
223
+ responses = [general_postprocess(pred) for pred in responses]
224
+ processed_answers = [[general_postprocess(j) for j in i] for i in answers]
225
+ matched_answers = []
226
+ for pred, ans, origin_ans in zip(responses, processed_answers, answers):
227
+ if pred in ans or pred in origin_ans:
228
+ matched_answers.append(pred)
229
+ else:
230
+ matched_answers.append(ans[0])
231
+
232
+ return responses, matched_answers
233
 
234
  def bbh_mcq(responses: list[str], answers: list[str | int]):
235
  if len(responses) != len(answers):
 
260
 
261
  return responses, answers
262
 
 
 
 
 
 
 
 
263
  def MATH(responses: list[str], answers: list[str]):
264
+ extract_responses = []
265
+ for response in responses:
266
+ indices = [pos for pos, char in enumerate(response) if char == "$"]
267
+ if len(indices) <= 2:
268
+ ans = ""
269
+ else:
270
+ ans = response[indices[-2] + 1 : indices[-1]]
271
+ extract_responses.append(strip_string(ans))
272
+ extract_answers = [strip_string(get_answer(answer)) for answer in answers]
273
+ return extract_responses, extract_answers
 
274
 
275
 
276
  class CMMLU:
 
283
  for choice in list("ABCD"):
284
  prompt += f"\n{choice}. {example[choice]}"
285
 
286
+ prompt += "\n答案:"
287
  return {"prompt": prompt}
288
 
289
  subcategories = {
 
380
  ],
381
  "Other": ["other"],
382
  "China specific": ["china specific"],
383
+ "Test": ["computer science"],
384
  }
385
 
386
  @classmethod
 
607
  input_column = "input"
608
  label_column = "answers"
609
 
610
+ icl_prompt = """\
611
+ Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
612
+ Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
613
+ Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
614
+
615
+ Text: Playing in their second straight Thanksgiving game, the Eagles struggled especially on defense, where they were unable to stop the much-hyped Lions offense. The worst of it all was how unproven rookie Eric Rowe was tasked with covering wide receiver Calvin Johnson, leading to Johnson catching 3 touchdowns. Stafford’s five passing touchdowns, including three of them to Johnson was too much for the Eagles to overcome and for the second consecutive time this season, the Eagles gave up 45 points in a game. With the loss, the Eagles drop to 4-7 on the season and 6-1 when playing on Thanksgiving.
616
+ Question: How many TD passes did Stafford throw other than to Johnson?
617
+ Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Johnson. 5-3=2. So the answer is 2.
618
+
619
+ Text: [PROMPT]
620
+ Question: [QUESTION]
621
+ Anawer:"""
622
+
623
+ categories = ["validation"]
624
+
625
  @classmethod
626
  def prompt_drop(cls, example):
627
+ prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
628
+ "[QUESTION]", example["question"]
629
+ )
630
 
631
+ validated_answers = example["answers_spans"]["spans"]
632
+ validated_types = example["answers_spans"]["types"]
633
+ answers = []
634
+ for answer_item, answer_type in zip(validated_answers, validated_types):
635
+ # if answer_type == "number":
636
+ # answers.append(answer_item)
637
+ # elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
638
+ # d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
639
+ # answers.append(' '.join(d).strip())
640
+ # else:
641
+ # for span in answer_item['spans']:
642
+ # answers.append(span)
643
+ answers.append(answer_item)
644
+ answers = list(set(answers))
645
+
646
+ return {cls.input_column: prompt, cls.label_column: answers}
647
 
648
  @classmethod
649
  def suite(
650
  cls,
651
  ):
652
+ finer_categories = (
653
+ pd.Series(cls.categories) # noqa # type: ignore
654
+ .explode()
655
+ .reset_index()
656
+ .set_index(0)
657
+ .groupby(0)
658
+ .agg(list)["index"]
659
+ .to_dict()
 
660
  )
661
+ suite = defaultdict(list)
662
+ categories = list(finer_categories.keys())
663
+ for cate in categories:
664
+ suite[cate].append(
665
+ Task(
666
+ ("drop", cate),
667
+ metric_name=("sustech/tlem", "drop"),
668
+ input_column=cls.input_column,
669
+ label_column=cls.label_column,
670
+ prompt=partial(cls.prompt_drop),
671
+ few_shot=0,
672
+ split="validation",
673
+ )
674
+ )
675
+
676
+ return suite
677
 
678
 
679
  class HellaSwag:
 
744
 
745
  @classmethod
746
  def suite(cls):
747
+ finer_categories = (
748
+ pd.Series(cls.categories) # noqa # type: ignore
749
+ .explode()
750
+ .reset_index()
751
+ .set_index(0)
752
+ .groupby(0)
753
+ .agg(list)["index"]
754
+ .to_dict()
755
+ )
756
+ suite = defaultdict(list)
757
+ categories = list(finer_categories.keys())
758
+ for cate in categories:
759
+ suite[cate].append(
760
+ Task(
761
+ ("ai2_arc", cate),
762
+ metric_name=("sustech/tlem", "arc"),
763
+ input_column=cls.input_column,
764
+ label_column=cls.label_column,
765
+ prompt=partial(cls.prompt_arc),
766
+ few_shot=0,
767
+ )
768
  )
 
 
769
 
770
  return suite
771
 
 
827
  def suite(
828
  cls,
829
  ):
830
+ finer_categories = (
831
+ pd.Series(
832
+ cls.bbh_free_form_sets + cls.bbh_multiple_choice_sets
833
+ ) # noqa # type: ignore
834
+ .explode()
835
+ .reset_index()
836
+ .set_index(0)
837
+ .groupby(0)
838
+ .agg(list)["index"]
839
+ .to_dict()
840
+ )
841
+ suite = defaultdict(list)
842
+ categories = list(finer_categories.keys())
843
+ for cate in categories:
844
+ if cate in cls.bbh_multiple_choice_sets:
845
+ suite[cate].append(
846
+ Task(
847
+ ("lukaemon/bbh", cate),
848
+ metric_name=("sustech/tlem", "bbh_mcq"),
849
+ input_column=cls.input_column,
850
+ label_column=cls.label_column,
851
+ prompt=partial(cls.prompt_bbh, category=cate),
852
+ few_shot=0,
853
+ )
854
  )
855
+ else:
856
+ suite[cate].append(
857
+ Task(
858
+ ("lukaemon/bbh", cate),
859
+ metric_name=("sustech/tlem", "bbh_freefrom"),
860
+ input_column=cls.input_column,
861
+ label_column=cls.label_column,
862
+ prompt=partial(cls.prompt_bbh, category=cate),
863
+ few_shot=0,
864
+ )
865
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
866
 
867
  return suite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
868
 
 
869
 
870
  class CEVAL:
871
  input_column = "input"
tlem.py CHANGED
@@ -1,6 +1,12 @@
1
- import logging
 
 
 
 
 
2
 
3
  from typing import Any, Optional, Protocol, Iterable, Callable
 
4
  from tqdm.auto import tqdm
5
  from evaluate.evaluation_suite import EvaluationSuite
6
  import evaluate
@@ -8,10 +14,7 @@ import numpy as np
8
  import datasets
9
  import pandas as pd
10
  from .tasks import *
11
- from .utils import *
12
- from itertools import chain
13
- from copy import deepcopy
14
- from . import utils
15
 
16
 
17
  class ReasoningMetric(evaluate.Metric):
@@ -43,60 +46,32 @@ class ReasoningMetric(evaluate.Metric):
43
  reference_urls=["http://path.to.reference.url/new_module"],
44
  )
45
 
46
- def _compute(self, responses, references):
47
- return_value = getattr(Metrics, self.config_name)(responses, references)
48
- match return_value:
49
- case extract_responses, extract_references:
50
- results = {
51
- self.config_name: np.mean(
52
- sync_pipe(lambda x, y: x == y)(
53
- zip(extract_responses, extract_references)
54
- )
55
- )
56
- }
57
- case dict():
58
- results = return_value
59
-
60
- case list():
61
- results = {self.config_name: np.mean(return_value)}
62
-
63
- case _:
64
- raise NotImplementedError
65
-
 
66
  return results
67
 
68
 
69
  class Suite(EvaluationSuite):
70
  task_class = Task
71
- utils = utils
72
- supported_datasets = [
73
- "arc",
74
- "hellaswag",
75
- "mmlu-chat",
76
- "winogrande",
77
- "gsm8k",
78
- "cmmlu-chat",
79
- "ceval-chat",
80
- "bbh",
81
- "drop",
82
- "MATH",
83
- ]
84
-
85
- def __getitem__(self, key) -> Task:
86
- match key:
87
- case str():
88
- return self.suite[key]
89
- case slice() | int():
90
- return self.tasks[key]
91
-
92
- def agg(self, suite):
93
- for cate, tasks in suite.items():
94
- if isinstance(tasks, dict):
95
- suite[cate] = self.agg(tasks)
96
- else:
97
- suite[cate] = np.mean([pd.Series(task.result).mean() for task in tasks])
98
-
99
- return suite
100
 
101
  def run(
102
  self,
@@ -104,26 +79,28 @@ class Suite(EvaluationSuite):
104
  ) -> dict[str, float]:
105
  self.assert_suite_nonempty()
106
 
107
- self.suite: dict[str, list[Task]]
108
- for task in (bar := tqdm(self.tasks)):
109
- bar.desc = f"complete {task.name}."
110
- _ = task.run(model_or_pipeline)
111
- logging.info(f"{task.name} {task.result=}")
112
- return self.agg(deepcopy(self.suite))
 
113
 
114
- def arun(self, model_or_pipeline):
115
- async def sync_function():
116
- return await tqdm.gather(
117
- *[task.arun(model_or_pipeline) for task in self.tasks], leave=False
118
- )
 
119
 
120
- asyncio.run(sync_function())
121
 
122
- return self.agg(deepcopy(self.suite))
 
123
 
124
- def get_suite(self, name) -> dict[str, Task]:
125
  chat = False
126
- suite={}
127
  match name:
128
  case _ if "chat" in name:
129
  chat = True
@@ -132,8 +109,6 @@ class Suite(EvaluationSuite):
132
  suite = MMLU.suite(chat=chat)
133
  case _ if name.startswith("cmmlu"):
134
  suite = CMMLU.suite(chat=chat)
135
- case _ if name.startswith("ceval"):
136
- suite = CEVAL.suite(chat=chat)
137
  case "gsm8k":
138
  suite = Task(
139
  dataset_name=("gsm8k", "main"),
@@ -151,10 +126,8 @@ class Suite(EvaluationSuite):
151
  suite = DROP.suite()
152
  case "winogrande":
153
  suite = Winogrande.suite()
154
- case "truthfulqa_mc1":
155
- suite = TruthfulQAMC1.suite()
156
- case _ if name.startswith("boolq"):
157
- suite = BoolQ.suite(chat=chat)
158
  case "mt_bench":
159
  suite = Task(
160
  dataset_name="SUSTech/mt_bench_judge",
@@ -165,78 +138,16 @@ class Suite(EvaluationSuite):
165
  case "MATH" | "competition_math":
166
  suite = Task(
167
  dataset_name="hendrycks/competition_math",
168
- prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form.",
 
169
  metric_name=("sustech/tlem", "MATH"),
170
  input_column="problem",
171
  label_column="solution",
172
  )
173
 
174
- case "open-leaderboard":
175
- for name in [
176
- "arc",
177
- "hellaswag",
178
- "mmlu-chat",
179
- "winogrande",
180
- "gsm8k",
181
- # "truthful_qa",
182
- "drop",
183
- ]:
184
- suite.update(self.get_suite(name))
185
- case "tlem":
186
- for name in [
187
- "arc",
188
- "hellaswag",
189
- "mmlu-chat",
190
- "winogrande",
191
- "gsm8k",
192
- # "truthful_qa",
193
- "cmmlu-chat",
194
- "ceval-chat",
195
- "bbh",
196
- ]:
197
- suite.update(self.get_suite(name))
198
-
199
- case "all":
200
- for name in self.supported_datasets:
201
- suite.update(self.get_suite(name))
202
- case _:
203
- raise NotImplementedError(
204
- f"{name} is not supported in {self.supported_datasets}"
205
- )
206
-
207
- if isinstance(suite, Task):
208
- suite = [suite]
209
- suite = {name: suite}
210
-
211
- return suite
212
-
213
- def singleton(self, task):
214
- try:
215
- return self.tasks[self.tasks.index(task)]
216
- except ValueError:
217
- logging.debug(f"add {task.name} to suite.")
218
- self.tasks.append(task)
219
- logging.debug(self.tasks)
220
- return self.tasks[-1]
221
-
222
- def drop_duplicates(self, suite):
223
- for category, tasks in suite.items():
224
- match tasks:
225
- case list():
226
- suite[category] = [self.singleton(task) for task in tasks]
227
- case dict():
228
- suite[category] = self.drop_duplicates(tasks)
229
- case _:
230
- raise NotImplementedError
231
- return suite
232
-
233
- def load(self, name):
234
- sub_suite = self.get_suite(name)
235
- self.suite.update(sub_suite)
236
- self.suite = self.drop_duplicates(self.suite)
237
- # return self
238
 
239
  def __init__(self, name="tlem"):
240
  super().__init__(name)
241
- self.tasks = []
242
- self.suite = {}
 
1
+ # %%
2
+
3
+ try:
4
+ from ipytorch import logging
5
+ except Exception as e:
6
+ import logging
7
 
8
  from typing import Any, Optional, Protocol, Iterable, Callable
9
+ from numpy.lib import extract
10
  from tqdm.auto import tqdm
11
  from evaluate.evaluation_suite import EvaluationSuite
12
  import evaluate
 
14
  import datasets
15
  import pandas as pd
16
  from .tasks import *
17
+ from .utils import is_equiv
 
 
 
18
 
19
 
20
  class ReasoningMetric(evaluate.Metric):
 
46
  reference_urls=["http://path.to.reference.url/new_module"],
47
  )
48
 
49
+ def _compute(self, responses, references, verbose=False):
50
+ extract_responses, extract_references = getattr(Metrics, self.config_name)(
51
+ responses, references
52
+ )
53
+ df = pd.DataFrame(
54
+ {
55
+ "responses": responses,
56
+ "references": references,
57
+ }
58
+ )
59
+ df["extract_responses"] = extract_responses
60
+ df["extract_references"] = extract_references
61
+ # print(df)
62
+ results = {
63
+ "Accuracy": (df["extract_references"] == df["extract_responses"])
64
+ .astype(int)
65
+ .mean(),
66
+ }
67
+ logging.info(results)
68
+ if verbose:
69
+ results["df"] = df
70
  return results
71
 
72
 
73
  class Suite(EvaluationSuite):
74
  task_class = Task
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def run(
77
  self,
 
79
  ) -> dict[str, float]:
80
  self.assert_suite_nonempty()
81
 
82
+ def run_tasks(tasks):
83
+ for task in (bar := tqdm(tasks, leave=False)):
84
+ bar.desc = f"complete {task.name}."
85
+ if task.name not in self.cached_result:
86
+ self.cached_result[task.name] = task.run(model_or_pipeline)
87
+ results = [self.cached_result[task.name] for task in tasks]
88
+ return pd.DataFrame(results).mean().to_dict()
89
 
90
+ if isinstance(self.suite, dict):
91
+ for category, tasks in (bar := tqdm(self.suite.items())):
92
+ bar.desc = f"complete {category}."
93
+ logging.warning(f"Combined results {category}: {run_tasks(tasks)}")
94
+ else:
95
+ logging.warning(f"Combined results: {run_tasks(self.suite)}")
96
 
97
+ return self.cached_result
98
 
99
+ def add(self, name):
100
+ self.load(name)
101
 
102
+ def load(self, name):
103
  chat = False
 
104
  match name:
105
  case _ if "chat" in name:
106
  chat = True
 
109
  suite = MMLU.suite(chat=chat)
110
  case _ if name.startswith("cmmlu"):
111
  suite = CMMLU.suite(chat=chat)
 
 
112
  case "gsm8k":
113
  suite = Task(
114
  dataset_name=("gsm8k", "main"),
 
126
  suite = DROP.suite()
127
  case "winogrande":
128
  suite = Winogrande.suite()
129
+ case _ if name.startswith("ceval"):
130
+ suite = CEVAL.suite(chat=chat)
 
 
131
  case "mt_bench":
132
  suite = Task(
133
  dataset_name="SUSTech/mt_bench_judge",
 
138
  case "MATH" | "competition_math":
139
  suite = Task(
140
  dataset_name="hendrycks/competition_math",
141
+ split="test",
142
+ prompt="This is a math problem, please think step by step and slove it: {input_column}",
143
  metric_name=("sustech/tlem", "MATH"),
144
  input_column="problem",
145
  label_column="solution",
146
  )
147
 
148
+ self.suite = [suite] if isinstance(suite, Task) else suite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  def __init__(self, name="tlem"):
151
  super().__init__(name)
152
+ self.cached_result = {}
153
+ self.suite = []
utils.py CHANGED
@@ -2,8 +2,6 @@ import logging
2
  import re
3
  import numpy as np
4
  from typing import Any
5
- from tqdm.auto import tqdm
6
- import asyncio
7
 
8
  NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
9
  NUMERIC_IN_ZH = (
@@ -11,38 +9,6 @@ NUMERIC_IN_ZH = (
11
  )
12
 
13
 
14
- def async_pipe(func):
15
- async def sync_function(samples):
16
- if not isinstance(samples, list):
17
- samples = [samples]
18
- return await tqdm.gather(*[func(sample) for sample in samples], leave=False)
19
-
20
- def sync_func(samples):
21
- return asyncio.run(sync_function(samples))
22
-
23
- return sync_func
24
-
25
-
26
- def sync_pipe(func, progress=False):
27
- def sync_func(samples):
28
- return [
29
- func(*sample) if isinstance(sample, tuple) else func(sample)
30
- for sample in tqdm(samples, disable=not progress, leave=False)
31
- ]
32
-
33
- return sync_func
34
-
35
-
36
- def asis_backup(func):
37
- def wrapper(sample):
38
- try:
39
- return func(sample)
40
- except Exception:
41
- return sample
42
-
43
- return wrapper
44
-
45
-
46
  def extract_choice_zh(gen):
47
  # 答案是A | 选项是A | 应该选A选项
48
  res = re.search(
@@ -74,27 +40,27 @@ def extract_choice_zh(gen):
74
  def extract_choice(gen):
75
  # answer is A | choice is A | choose A
76
  res = re.search(
77
- r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCDEFGHIJKL]{0,20}?(?:n't|not))[^ABCDEFGHIJKL]{0,10}?\b(?:|is|:|be))\b)[^ABCDEFGHIJKL]{0,20}?\b(A|B|C|D|E|F|G|H|I|J|K|L)\b",
78
  gen,
79
  )
80
 
81
  # A is correct | A is right
82
  if res is None:
83
  res = re.search(
84
- r"\b(A|B|C|D|E|F|G|H|I|J|K|L)\b(?![^ABCDEFGHIJKL]{0,8}?(?:n't|not)[^ABCDEFGHIJKL]{0,5}?(?:correct|right))[^ABCDEFGHIJKL]{0,10}?\b(?:correct|right)\b",
85
  gen,
86
  )
87
 
88
  # straight answer: A
89
  if res is None:
90
- res = re.search(r"^(A|B|C|D|E|F|G|H|I|J|K|L)(?:\.|,|:|$)", gen)
91
 
92
  # simply extract the first appearred letter
93
  if res is None:
94
- res = re.search(r"(?<![a-zA-Z])(A|B|C|D|E|F|G|H|I|J|K|L)(?![a-zA-Z=])", gen)
95
 
96
  if res is None:
97
- res = "L"
98
 
99
  if isinstance(res, str):
100
  return res
@@ -136,20 +102,17 @@ def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str:
136
 
137
 
138
  def remove_boxed(s):
139
- try:
140
- if (left := "\\boxed ") in s:
141
- assert s[: len(left)] == left, s
142
- return s[len(left) :]
143
- elif (left := "\\boxed{") in s:
144
- assert s[: len(left)] == left, s
145
- return s[len(left) : -1]
 
 
146
 
147
- elif (left := "\\fbox{") in s:
148
- assert s[: len(left)] == left, s
149
- return s[len(left) : -1]
150
- raise ValueError(f"Cannot remove boxed from {s}")
151
- except AssertionError:
152
- return s
153
 
154
 
155
  def last_boxed_only_string(string):
@@ -182,33 +145,183 @@ def last_boxed_only_string(string):
182
  return retval
183
 
184
 
185
- @asis_backup
186
- def get_answer(string):
187
- if boxed := last_boxed_only_string(string):
188
- return remove_boxed(boxed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  else:
190
- indices = [pos for pos, char in enumerate(string) if char == "$"]
191
- if len(indices) < 2:
192
- return extract_numeric(string)
193
- string = string[indices[-2] + 1 : indices[-1]]
194
- return string.split("=")[-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  def first_option_postprocess(text: str, options: str) -> str:
198
  """Find first valid option for text."""
199
 
200
  patterns = [
201
- f"[Tt]he answer is [{options}]",
202
- f"[Tt]he correct answer\s?(?:option)?\s?is [{options}]", # noqa
203
- f"答案(?:选项)?是(.*?)[{options}]",
204
- f"答案(?:选项)?为(.*?)[{options}]",
205
- f"答案(?:选项)?选(.*?)[{options}]",
206
- f"选项[{options}]是?正确",
207
- f"选项[{options}]为?正确",
208
- f"固选(.*?)[{options}]",
209
- f"答案应该是(.*?)[{options}]",
210
- f"(\s|^)[{options}][\s。,,\.$]", # noqa
211
- f"[{options}]",
212
  ]
213
 
214
  regexes = [re.compile(pattern) for pattern in patterns]
@@ -219,41 +332,44 @@ def first_option_postprocess(text: str, options: str) -> str:
219
  for i in options:
220
  if i in outputs:
221
  return i
222
- return ""
223
 
224
 
225
  def first_capital_postprocess(text: str) -> str:
226
  for t in text:
227
  if t.isupper():
228
  return t
229
- return ""
230
 
231
 
232
  def general_postprocess(text: str) -> str:
233
  # Cut off the first newline, period, or comma
234
- truncated_text = re.split(r"[\n.,]", text, 1)[0]
235
 
236
  # Remove punctuation
237
- no_punctuation = re.sub(r"[^\w\s]", "", truncated_text)
238
 
239
  # Remove article
240
- no_articles = re.sub(r"\b(a|an|the)\b", "", no_punctuation, flags=re.IGNORECASE)
 
 
 
241
 
242
  # Remove duplicated blank spaces
243
- cleaned_text = re.sub(r"\s+", " ", no_articles).strip()
244
 
245
  return cleaned_text
246
 
247
-
248
  def bbh_mcq_postprocess(text: str) -> str:
249
  ans = text
250
- ans_line = ans.split("answer is ")
251
  if len(ans_line) != 1:
252
  ans = ans_line[-1].strip()
253
- match = re.search(r"\(([A-Z])\)*", ans)
254
  if match:
255
  return f"({match.group(1)})"
256
- match = re.search(r"([A-Z])", ans)
257
  if match:
258
  return f"({match.group(1)})"
259
  return f"({ans})"
@@ -261,17 +377,18 @@ def bbh_mcq_postprocess(text: str) -> str:
261
 
262
  def bbh_freeform_postprocess(text: str) -> str:
263
  ans = text
264
- ans_line = ans.split("answer is ")
265
  if len(ans_line) != 1:
266
  ans = ans_line[-1].strip()
267
- ans = ans.split("\n")[0]
268
- if ans.endswith("."):
269
  ans = ans[:-1]
270
  return ans
271
 
272
 
 
273
  icl_prompts = {
274
- "temporal_sequences": """Task description: Answer questions about which times certain events could have occurred.
275
 
276
  Q: Today, Emily went to the museum. Between what times could they have gone?
277
  We know that:
@@ -348,7 +465,9 @@ Wake-up time: 5am.
348
  3pm-4pm: free.
349
  The beach closure time: 4pm.
350
  The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
351
- "disambiguation_qa": """Clarify the meaning of sentences with ambiguous pronouns.
 
 
352
 
353
  Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
354
  Sentence: The chief told the counselor that they took the day off.
@@ -385,7 +504,9 @@ Here we need to determine who the pronoun "his" might be referring to. There are
385
  Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
386
  Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
387
  Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
388
- "date_understanding": """Infer the date from context.
 
 
389
 
390
  Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
391
  Options:
@@ -418,7 +539,9 @@ Options:
418
  (F) 12/03/1960
419
  A: Let's think step by step.
420
  If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
421
- "tracking_shuffled_objects_three_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
422
 
423
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
424
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -458,7 +581,9 @@ A: Let's think step by step.
458
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
459
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
460
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
461
- "penguins_in_a_table": """Answer questions about a table of penguins and their attributes.
 
 
462
 
463
  Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
464
  James, 12, 90, 12
@@ -499,7 +624,9 @@ This question focuses on the name. We know the following: The names of the pengu
499
  When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
500
  The name of the second penguin sorted by alphabetical order is Gwen.
501
  The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
502
- "geometric_shapes": """Name geometric shapes from their SVG paths.
 
 
503
 
504
  Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
505
  Options:
@@ -577,7 +704,9 @@ Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 +
577
  Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
578
  Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
579
  Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
580
- "snarks": """Determine which of two sentences is sarcastic.
 
 
581
 
582
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
583
 
@@ -607,7 +736,9 @@ A: Let's think step by step.
607
  If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
608
  If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
609
  Above the above, the sarcastic option is (A). So the answer is (A).""",
610
- "ruin_names": """Select the humorous edit that 'ruins' the input movie or musical artist name.
 
 
611
 
612
  Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
613
  Options:
@@ -651,7 +782,9 @@ The original name is "counting crows". This is the name of an American rock band
651
  (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
652
  (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
653
  Above the above, the only humorous edit is (B). So the answer is (B).""",
654
- "tracking_shuffled_objects_seven_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
655
 
656
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
657
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -691,7 +824,9 @@ A: Let's think step by step.
691
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
692
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
693
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
694
- "tracking_shuffled_objects_five_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
695
 
696
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
697
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -731,7 +866,9 @@ A: Let's think step by step.
731
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
732
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
733
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
734
- "logical_deduction_three_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
735
 
736
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
737
  Options:
@@ -768,7 +905,9 @@ A: Let's think step by step.
768
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
769
  According to this ordering, the leftmost book is the white book.
770
  The white book is the leftmost. So the answer is (C).""",
771
- "hyperbaton": """Order adjectives correctly in English sentences.
 
 
772
 
773
  Q: Which sentence has the correct adjective order:
774
  Options:
@@ -796,7 +935,9 @@ A: Let's think step by step.
796
  When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
797
  Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
798
  Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
799
- "logical_deduction_five_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
800
 
801
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
802
  Options:
@@ -833,7 +974,9 @@ A: Let's think step by step.
833
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
834
  According to this ordering, the leftmost book is the white book.
835
  The white book is the leftmost. So the answer is (C).""",
836
- "logical_deduction_seven_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
837
 
838
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
839
  Options:
@@ -870,7 +1013,9 @@ A: Let's think step by step.
870
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
871
  According to this ordering, the leftmost book is the white book.
872
  The white book is the leftmost. So the answer is (C).""",
873
- "movie_recommendation": """Recommend movies similar to the given list of movies.
 
 
874
 
875
  Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
876
  Options:
@@ -912,7 +1057,9 @@ A: Let's think step by step.
912
  - Inside Out (animation, family, comedy; 2015)
913
  - Forrest Gump (comedy, drama, romance; 1994)
914
  These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
915
- "salient_translation_error_detection": """Detect the type of error in an English translation of a German source sentence.
 
 
916
 
917
  Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
918
  Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
@@ -952,7 +1099,9 @@ Options:
952
  (F) Facts
953
  A: Let's think step by step.
954
  We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
955
- "reasoning_about_colored_objects": """Answer extremely simple questions about the colors of objects on a surface.
 
 
956
 
957
  Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
958
  Options:
@@ -1015,7 +1164,8 @@ A: Let's think step by step.
1015
  According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
1016
  The teal plate is the first item, namely (1). There is no item to the left of the teal item.
1017
  The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
1018
- "multistep_arithmetic_two": """Solve multi-step arithmetic problems.
 
1019
 
1020
  Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
1021
  A: Let's think step by step.
@@ -1040,7 +1190,9 @@ This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9
1040
  Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
1041
  Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
1042
  Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
1043
- "navigate": """Given a series of navigation instructions, determine whether one would end up back at the starting point.
 
 
1044
 
1045
  Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
1046
  Options:
@@ -1083,7 +1235,9 @@ We start at the origin (0, 0), facing the positive y-axis.
1083
  (3) Take 9 steps left: (0, -7), facing the positive y-axis.
1084
  (4) Take 7 steps right: (0, 7), facing the positive y-axis.
1085
  Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
1086
- "dyck_languages": """Correctly close a Dyck-n word.
 
 
1087
 
1088
  Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
1089
  A: Let's think step by step.
@@ -1155,7 +1309,9 @@ We should process each input one by one and keep track of the stack configuratio
1155
  Now, we have reached the end. The final stack is "< [ {".
1156
  We will need to pop out "{", "[", "<" one by one in that order.
1157
  So, we need "}", "]", ">". So the answer is } ] >.""",
1158
- "word_sorting": """Sort a list of words.
 
 
1159
 
1160
  Q: Sort the following words alphabetically: List: oven costume counterpart
1161
  A: Let's think step by step.
@@ -1172,7 +1328,9 @@ Q: Sort the following words alphabetically: List: newt arson parthia seismograph
1172
  A: Let's think step by step.
1173
  The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
1174
  The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
1175
- "sports_understanding": """Determine whether an artificially constructed sentence relating to sports is plausible or not.
 
 
1176
 
1177
  Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
1178
  A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
@@ -1182,7 +1340,9 @@ A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part
1182
 
1183
  Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
1184
  A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
1185
- "boolean_expressions": """Evaluate the result of a random Boolean expression.
 
 
1186
 
1187
  Q: not ( ( not not True ) ) is
1188
  A: Let's think step by step.
@@ -1205,7 +1365,9 @@ Remember that (i) expressions inside brackets are always evaluated first and tha
1205
  We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
1206
  Let's evaluate A: A = not ( False ) = not False = True.
1207
  Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
1208
- "object_counting": """Questions that involve enumerating objects and asking the model to count them.
 
 
1209
 
1210
  Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
1211
  A: Let's think step by step.
@@ -1242,7 +1404,9 @@ We first identify the vegetables on the list and include their quantity in paren
1242
  - garlic (1)
1243
  - yam (1)
1244
  Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
1245
- "formal_fallacies": """Distinguish deductively valid arguments from formal fallacies.
 
 
1246
 
1247
  Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
1248
  Is the argument, given the explicitly stated premises, deductively valid or invalid?
@@ -1286,7 +1450,9 @@ By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Ni
1286
  The case X = rare-consumer(Nioxin) does not appear in (2).
1287
  The case X = loyal-buyer(Caress) does not appear in (2), either.
1288
  So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
1289
- "causal_judgement": """Answer questions about causal attribution.
 
 
1290
 
1291
  Q: How would a typical person answer each of the following questions about causation?
1292
  Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
@@ -1311,7 +1477,7 @@ Options:
1311
  - No
1312
  A: Let's think step by step.
1313
  Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
1314
- "web_of_lies": """Evaluate a random boolean function expressed as a word problem.
1315
 
1316
  Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
1317
  A: Let's think step by step.
 
2
  import re
3
  import numpy as np
4
  from typing import Any
 
 
5
 
6
  NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
7
  NUMERIC_IN_ZH = (
 
9
  )
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def extract_choice_zh(gen):
13
  # 答案是A | 选项是A | 应该选A选项
14
  res = re.search(
 
40
  def extract_choice(gen):
41
  # answer is A | choice is A | choose A
42
  res = re.search(
43
+ r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
44
  gen,
45
  )
46
 
47
  # A is correct | A is right
48
  if res is None:
49
  res = re.search(
50
+ r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
51
  gen,
52
  )
53
 
54
  # straight answer: A
55
  if res is None:
56
+ res = re.search(r"^(A|B|C|D)(?:\.|,|:|$)", gen)
57
 
58
  # simply extract the first appearred letter
59
  if res is None:
60
+ res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
61
 
62
  if res is None:
63
+ res = "A"
64
 
65
  if isinstance(res, str):
66
  return res
 
102
 
103
 
104
def remove_boxed(s):
    """Strip a LaTeX ``\\boxed``/``\\fbox`` wrapper from *s*.

    Supports the three forms produced by ``last_boxed_only_string``:
    ``\\boxed <rest>``, ``\\boxed{...}`` and ``\\fbox{...}``.

    Raises:
        ValueError: if *s* is not wrapped in a recognized box command.
            Callers such as ``get_answer`` catch this and fall back to
            the raw string.
    """
    prefix = "\\boxed "
    if s.startswith(prefix):
        return s[len(prefix):]
    # Braced forms: require a closing brace at the very end.
    for prefix in ("\\boxed{", "\\fbox{"):
        if s.startswith(prefix) and s.endswith("}"):
            return s[len(prefix):-1]
    raise ValueError(f"Cannot remove boxed from {s!r}")
 
 
 
 
 
116
 
117
 
118
  def last_boxed_only_string(string):
 
145
  return retval
146
 
147
 
148
def fix_sqrt(string):
    """Normalize ``\\sqrt x`` to ``\\sqrt{x}``.

    Only the single character immediately following ``\\sqrt`` is wrapped;
    pieces that already start with ``{`` are left untouched.  Guards against
    an IndexError when ``\\sqrt`` is the last thing in the string (the split
    piece is then empty).
    """
    if "\\sqrt" not in string:
        return string
    pieces = string.split("\\sqrt")
    fixed = pieces[0]
    for piece in pieces[1:]:
        if piece and piece[0] != "{":
            # wrap the single character following \sqrt in braces
            fixed += "\\sqrt{" + piece[0] + "}" + piece[1:]
        else:
            # already braced, or nothing follows: keep as-is
            fixed += "\\sqrt" + piece
    return fixed
161
+
162
+
163
def remove_right_units(string):
    """Drop a trailing units annotation such as ``5\\text{ cm}`` -> ``5``.

    "\\text{ " only ever occurs (at least in the val set) when describing
    units, so everything from the first occurrence onward is discarded.
    Unlike the previous assert-based version, multiple occurrences no
    longer raise.
    """
    if "\\text{ " not in string:
        return string
    # keep everything before the first units annotation
    return string.split("\\text{ ", 1)[0]
171
+
172
+
173
def fix_fracs(string):
    """Rewrite ``\\frac`` arguments so both numerator and denominator are braced.

    Turns e.g. ``\\frac12`` into ``\\frac{1}{2}`` and ``\\frac1{72}`` into
    ``\\frac{1}{72}``; pieces that already start with ``{`` are trusted as-is.
    If a ``\\frac`` is followed by fewer than two characters, the original
    string is returned unchanged.
    """
    pieces = string.split("\\frac")
    rebuilt = pieces[0]
    for piece in pieces[1:]:
        rebuilt += "\\frac"
        if piece[0] == "{":
            # numerator already braced: trust the rest of the piece
            rebuilt += piece
            continue
        if len(piece) < 2:
            # not enough characters to form numerator + denominator
            return string
        num, den, tail = piece[0], piece[1], piece[2:]
        if den == "{":
            # only the numerator needs bracing; denominator is braced already
            rebuilt += "{" + num + "}" + den + tail
        else:
            rebuilt += "{" + num + "}{" + den + "}" + tail
    return rebuilt
203
+
204
+
205
def fix_a_slash_b(string):
    """Convert a plain ``a/b`` fraction (both integers) into ``\\frac{a}{b}``.

    Anything that is not exactly ``<int>/<int>`` in canonical form is
    returned unchanged.  (Previously used a bare ``assert`` plus a broad
    ``except Exception as e`` with an unused binding.)
    """
    parts = string.split("/")
    if len(parts) != 2:
        return string
    try:
        a = int(parts[0])
        b = int(parts[1])
    except ValueError:
        # not an integer fraction
        return string
    # reject non-canonical spellings such as "01/2" or "+1/2"
    if string != f"{a}/{b}":
        return string
    return "\\frac{" + str(a) + "}{" + str(b) + "}"
218
+
219
+
220
def strip_string(string):
    """Canonicalize a LaTeX answer string for comparison (see ``is_equiv``).

    Applies, in order: whitespace/linebreak cleanup, frac-variant and
    delimiter normalization, unit/percent/dollar stripping, decimal-point
    fixes, ``k = ...`` prefix removal, sqrt/frac brace normalization, and
    a canonical spelling for one half.
    """
    # drop linebreaks and inverse spaces; collapse doubled backslashes
    for old, new in (("\n", ""), ("\\!", ""), ("\\\\", "\\")):
        string = string.replace(old, new)

    # normalize frac variants
    string = string.replace("tfrac", "frac").replace("dfrac", "frac")

    # strip \left / \right delimiters and degree markers
    for token in ("\\left", "\\right", "^{\\circ}", "^\\circ"):
        string = string.replace(token, "")

    # drop dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right), e.g. "\text{ cm}"
    string = remove_right_units(string)

    # drop percent signs
    string = string.replace("\\%", "").replace("\%", "")  # noqa: W605

    # " ." -> " 0." and "{." -> "{0."
    string = string.replace(" .", " 0.").replace("{.", "{0.")

    # if empty, return empty string
    if len(string) == 0:
        return string
    # ".5" -> "0.5" at the start of the string
    if string[0] == ".":
        string = "0" + string

    # drop a short prefix like "k = " or "q = "
    parts = string.split("=")
    if len(parts) == 2 and len(parts[0]) <= 2:
        string = parts[1]

    # sqrt3 -> sqrt{3}
    string = fix_sqrt(string)

    # remove all remaining spaces
    string = string.replace(" ", "")

    # \frac12 -> \frac{1}{2}; also handles \frac1{72} (but not \frac{72}1)
    string = fix_fracs(string)

    # canonical spelling for one half
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y -> \frac{X}{Y} conversion intentionally disabled:
    # string = fix_a_slash_b(string)

    return string
283
+
284
+
285
def get_answer(string):
    """Extract the final boxed answer from a MATH-style solution string.

    Falls back to the raw input when no boxed expression can be parsed.
    """
    try:
        # strip_string normalization intentionally disabled here
        return remove_boxed(last_boxed_only_string(string))
    except Exception:
        return string
292
+
293
+
294
def is_equiv(str1, str2, verbose=False):
    """Compare two answer strings for equivalence after normalization.

    Both-``None`` inputs are treated as non-equivalent (with a warning);
    if normalization via ``strip_string`` fails, a literal comparison is
    used as a fallback.
    """
    if str1 is None and str2 is None:
        print("WARNING: Both None")
        return False
    if str1 is None or str2 is None:
        return False

    try:
        norm1 = strip_string(str1)
        norm2 = strip_string(str2)
        if verbose:
            print(norm1, norm2)
        return norm1 == norm2
    except Exception:
        # normalization failed: compare the raw strings
        return str1 == str2
309
 
310
  def first_option_postprocess(text: str, options: str) -> str:
311
  """Find first valid option for text."""
312
 
313
  patterns = [
314
+ f'[Tt]he answer is [{options}]',
315
+ f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]', # noqa
316
+ f'答案(?:选项)?是(.*?)[{options}]',
317
+ f'答案(?:选项)?为(.*?)[{options}]',
318
+ f'答案(?:选项)?选(.*?)[{options}]',
319
+ f'选项[{options}]是?正确',
320
+ f'选项[{options}]为?正确',
321
+ f'固选(.*?)[{options}]',
322
+ f'答案应该是(.*?)[{options}]',
323
+ f'(\s|^)[{options}][\s。,,\.$]', # noqa
324
+ f'[{options}]',
325
  ]
326
 
327
  regexes = [re.compile(pattern) for pattern in patterns]
 
332
  for i in options:
333
  if i in outputs:
334
  return i
335
+ return ''
336
 
337
 
338
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character in *text*, or '' if none exists."""
    return next((ch for ch in text if ch.isupper()), '')
343
 
344
 
345
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer for loose matching.

    Keeps only the text before the first newline/period/comma, strips
    punctuation and English articles, and collapses whitespace.
    """
    # Cut off at the first newline, period, or comma
    # (keyword maxsplit: the positional form is deprecated in Python 3.13)
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    # Remove articles (a/an/the)
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    # Remove duplicated blank spaces
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    return cleaned_text
  return cleaned_text
362
 
363
+
364
def bbh_mcq_postprocess(text: str) -> str:
    """Extract an MCQ option letter from *text* and return it as "(X)".

    Prefers the fragment after the last "answer is "; then tries a
    parenthesized capital, then any capital, and finally wraps the raw
    fragment in parentheses.
    """
    ans = text
    parts = ans.split('answer is ')
    if len(parts) != 1:
        ans = parts[-1].strip()
    for pattern in (r'\(([A-Z])\)*', r'([A-Z])'):
        found = re.search(pattern, ans)
        if found:
            return f"({found.group(1)})"
    return f"({ans})"
 
377
 
378
def bbh_freeform_postprocess(text: str) -> str:
    """Extract a free-form answer from *text*.

    Takes the text after the last "answer is " (when present), truncates
    at the first newline, and drops one trailing period.
    """
    parts = text.split('answer is ')
    answer = parts[-1].strip() if len(parts) != 1 else text
    answer = answer.split('\n')[0]
    return answer.removesuffix('.')
387
 
388
 
389
+
390
  icl_prompts = {
391
+ 'temporal_sequences': """Task description: Answer questions about which times certain events could have occurred.
392
 
393
  Q: Today, Emily went to the museum. Between what times could they have gone?
394
  We know that:
 
465
  3pm-4pm: free.
466
  The beach closure time: 4pm.
467
  The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
468
+
469
+
470
+ 'disambiguation_qa': """Clarify the meaning of sentences with ambiguous pronouns.
471
 
472
  Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
473
  Sentence: The chief told the counselor that they took the day off.
 
504
  Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
505
  Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
506
  Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
507
+
508
+
509
+ 'date_understanding': """Infer the date from context.
510
 
511
  Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
512
  Options:
 
539
  (F) 12/03/1960
540
  A: Let's think step by step.
541
  If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
542
+
543
+
544
+ 'tracking_shuffled_objects_three_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
545
 
546
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
547
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
581
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
582
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
583
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
584
+
585
+
586
+ 'penguins_in_a_table': """Answer questions about a table of penguins and their attributes.
587
 
588
  Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
589
  James, 12, 90, 12
 
624
  When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
625
  The name of the second penguin sorted by alphabetical order is Gwen.
626
  The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
627
+
628
+
629
+ 'geometric_shapes': """Name geometric shapes from their SVG paths.
630
 
631
  Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
632
  Options:
 
704
  Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
705
  Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
706
  Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
707
+
708
+
709
+ 'snarks': """Determine which of two sentences is sarcastic.
710
 
711
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
712
 
 
736
  If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
737
  If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
738
  Above the above, the sarcastic option is (A). So the answer is (A).""",
739
+
740
+
741
+ 'ruin_names': """Select the humorous edit that 'ruins' the input movie or musical artist name.
742
 
743
  Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
744
  Options:
 
782
  (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
783
  (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
784
  Above the above, the only humorous edit is (B). So the answer is (B).""",
785
+
786
+
787
+ 'tracking_shuffled_objects_seven_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
788
 
789
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
790
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
824
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
825
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
826
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
827
+
828
+
829
+ 'tracking_shuffled_objects_five_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
830
 
831
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
832
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
866
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
867
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
868
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
869
+
870
+
871
+ 'logical_deduction_three_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
872
 
873
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
874
  Options:
 
905
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
906
  According to this ordering, the leftmost book is the white book.
907
  The white book is the leftmost. So the answer is (C).""",
908
+
909
+
910
+ 'hyperbaton': """Order adjectives correctly in English sentences.
911
 
912
  Q: Which sentence has the correct adjective order:
913
  Options:
 
935
  When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
936
  Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
937
  Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
938
+
939
+
940
+ 'logical_deduction_five_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
941
 
942
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
943
  Options:
 
974
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
975
  According to this ordering, the leftmost book is the white book.
976
  The white book is the leftmost. So the answer is (C).""",
977
+
978
+
979
+ 'logical_deduction_seven_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
980
 
981
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
982
  Options:
 
1013
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
1014
  According to this ordering, the leftmost book is the white book.
1015
  The white book is the leftmost. So the answer is (C).""",
1016
+
1017
+
1018
+ 'movie_recommendation': """Recommend movies similar to the given list of movies.
1019
 
1020
  Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
1021
  Options:
 
1057
  - Inside Out (animation, family, comedy; 2015)
1058
  - Forrest Gump (comedy, drama, romance; 1994)
1059
  These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
1060
+
1061
+
1062
+ 'salient_translation_error_detection': """Detect the type of error in an English translation of a German source sentence.
1063
 
1064
  Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
1065
  Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
 
1099
  (F) Facts
1100
  A: Let's think step by step.
1101
  We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
1102
+
1103
+
1104
+ 'reasoning_about_colored_objects': """Answer extremely simple questions about the colors of objects on a surface.
1105
 
1106
  Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
1107
  Options:
 
1164
  According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
1165
  The teal plate is the first item, namely (1). There is no item to the left of the teal item.
1166
  The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
1167
+
1168
+ 'multistep_arithmetic_two': """Solve multi-step arithmetic problems.
1169
 
1170
  Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
1171
  A: Let's think step by step.
 
1190
  Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
1191
  Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
1192
  Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
1193
+
1194
+
1195
+ 'navigate': """Given a series of navigation instructions, determine whether one would end up back at the starting point.
1196
 
1197
  Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
1198
  Options:
 
1235
  (3) Take 9 steps left: (0, -7), facing the positive y-axis.
1236
  (4) Take 7 steps right: (0, 7), facing the positive y-axis.
1237
  Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
1238
+
1239
+
1240
+ 'dyck_languages': """Correctly close a Dyck-n word.
1241
 
1242
  Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
1243
  A: Let's think step by step.
 
1309
  Now, we have reached the end. The final stack is "< [ {".
1310
  We will need to pop out "{", "[", "<" one by one in that order.
1311
  So, we need "}", "]", ">". So the answer is } ] >.""",
1312
+
1313
+
1314
+ 'word_sorting': """Sort a list of words.
1315
 
1316
  Q: Sort the following words alphabetically: List: oven costume counterpart
1317
  A: Let's think step by step.
 
1328
  A: Let's think step by step.
1329
  The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
1330
  The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
1331
+
1332
+
1333
+ 'sports_understanding': """Determine whether an artificially constructed sentence relating to sports is plausible or not.
1334
 
1335
  Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
1336
  A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
 
1340
 
1341
  Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
1342
  A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
1343
+
1344
+
1345
+ 'boolean_expressions': """Evaluate the result of a random Boolean expression.
1346
 
1347
  Q: not ( ( not not True ) ) is
1348
  A: Let's think step by step.
 
1365
  We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
1366
  Let's evaluate A: A = not ( False ) = not False = True.
1367
  Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
1368
+
1369
+
1370
+ 'object_counting': """Questions that involve enumerating objects and asking the model to count them.
1371
 
1372
  Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
1373
  A: Let's think step by step.
 
1404
  - garlic (1)
1405
  - yam (1)
1406
  Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
1407
+
1408
+
1409
+ 'formal_fallacies': """Distinguish deductively valid arguments from formal fallacies.
1410
 
1411
  Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
1412
  Is the argument, given the explicitly stated premises, deductively valid or invalid?
 
1450
  The case X = rare-consumer(Nioxin) does not appear in (2).
1451
  The case X = loyal-buyer(Caress) does not appear in (2), either.
1452
  So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
1453
+
1454
+
1455
+ 'causal_judgement': """Answer questions about causal attribution.
1456
 
1457
  Q: How would a typical person answer each of the following questions about causation?
1458
  Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
 
1477
  - No
1478
  A: Let's think step by step.
1479
  Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
1480
+ 'web_of_lies': """Evaluate a random boolean function expressed as a word problem.
1481
 
1482
  Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
1483
  A: Let's think step by step.