JeffYang52415
committed
Commit 0450c4e
Parent(s): a6c5f53
refactor: remove system prompt
Browse files
- .gitignore +4 -0
- llmdataparser/base_parser.py +2 -8
- llmdataparser/bbh_parser.py +5 -8
- llmdataparser/gsm8k_parser.py +5 -7
- llmdataparser/humaneval_parser.py +7 -13
- llmdataparser/ifeval_parser.py +5 -8
- llmdataparser/math_parser.py +5 -7
- llmdataparser/mbpp_parser.py +4 -7
- llmdataparser/mgsm_parser.py +6 -9
- llmdataparser/mmlu_parser.py +12 -16
- llmdataparser/prompts.py +16 -115
- llmdataparser/tmlu_parser.py +5 -7
- llmdataparser/tw_legal_parser.py +5 -7
- tests/test_bbh_parser.py +3 -5
- tests/test_gsm8k_parser.py +2 -4
- tests/test_humaneval_parser.py +7 -12
- tests/test_ifeval_parser.py +2 -2
- tests/test_math_parser.py +3 -6
- tests/test_mbpp_parser.py +3 -17
- tests/test_mgsm_parser.py +2 -18
- tests/test_mmlu_parser.py +9 -9
- tests/test_tmlu_parser.py +3 -23
- tests/test_tw_legal_parser.py +7 -25
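At a glance, this commit drops the per-parser system prompts and the pre-rendered `prompt` field: parsed entries now carry a plain `question`, and callers attach whatever system prompt they want. A minimal usage sketch under that assumption; the dataset-loading call below is an assumption based on the parsers' `__main__` examples, not something shown in this diff:

```python
# Hedged sketch of post-refactor usage; the loading step is assumed, not confirmed by this diff.
from llmdataparser.gsm8k_parser import GSM8KDatasetParser

parser = GSM8KDatasetParser()   # no system_prompt argument after this refactor
parser.load()                   # assumed loader step; see the parser's __main__ block for the real workflow
parser.parse()

entry = parser.get_parsed_data[0]

# The caller now owns the system prompt; the parser only supplies the question.
messages = [
    {"role": "system", "content": "Solve the math problem step by step."},
    {"role": "user", "content": entry.question},
]
```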
.gitignore
CHANGED
@@ -38,3 +38,7 @@ dist/
 #notebook cache
 .ipynb_checkpoints/
 notebooks/
+
+#coverage
+.coverage
+.coverage.*
llmdataparser/base_parser.py
CHANGED
@@ -25,7 +25,7 @@ VALID_CATEGORIES = {
 class ParseEntry:
     """A simple base class for entries, customizable by each dataset parser."""
 
-    prompt: str
+    question: str
     answer: str
     raw_question: str
     raw_answer: str
@@ -166,18 +166,14 @@ class HuggingFaceDatasetParser(DatasetParser[T]):
     _task_names: ClassVar[list[str]]
     # _default_task is the default task to use if no task is specified, e.g. "algebra"
     _default_task: ClassVar[str]
-    # _default_system_prompt is the default system prompt to use if no system prompt is specified
-    _default_system_prompt: ClassVar[str]
     # _hidden_task_names is the list of task names that are hidden in the dataset, e.g. ["math", "physics", "chemistry"]
     _hidden_task_names: ClassVar[list[str]] = []
 
-    def __init__(self, system_prompt: str | None = None, **kwargs: Any) -> None:
+    def __init__(self, **kwargs: Any) -> None:
         """
         Initialize a HuggingFaceDatasetParser.
 
         Args:
-            system_prompt: Optional custom system prompt to use instead of the default.
-                If not provided, will use the class's _default_system_prompt.
             **kwargs: Additional keyword arguments passed to the parent class.
         """
         super().__init__()
@@ -187,8 +183,6 @@ class HuggingFaceDatasetParser(DatasetParser[T]):
         self.split_names: list[str] = []
         # _current_task is the task currently being processed, e.g. "algebra"
         self._current_task: str = ""
-        # _system_prompt is the system prompt currently being used
-        self._system_prompt: str = system_prompt or self._default_system_prompt
 
     def _get_current_task(self, data_entry: dict[str, Any] | None = None) -> str:
         """
llmdataparser/bbh_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import BBH_SYSTEM_PROMPT  # You'll need to create this
 
 
 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -17,14 +16,14 @@ class BBHParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_answer: str,
         task_name: str,
     ) -> "BBHParseEntry":
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -66,7 +65,6 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
         "word_sorting",
     ]
     _default_task: ClassVar[str] = "reasoning_about_colored_objects"
-    _default_system_prompt: ClassVar[str] = BBH_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -78,14 +76,13 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
         # Remove parentheses from the answer
         clean_answer = raw_answer.strip("()")
 
-
-        prompt = f"{self._system_prompt}\n\n{raw_question}"
+        question = str(raw_question)
 
         # Use task_name if provided, otherwise use default
         task = task_name or self._get_current_task(row)
 
         return BBHParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=clean_answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -176,5 +173,5 @@ if __name__ == "__main__":
     example = parsed_data[0]
     print("\nExample parsed entry:")
     print(f"Task: {example.task_name}")
-    print(f"Question: {example.prompt}")
+    print(f"Question: {example.question}")
     print(f"Answer: {example.answer}")
llmdataparser/gsm8k_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import GSM8K_SYSTEM_PROMPT
 
 
 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -21,7 +20,7 @@ class GSM8KParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_answer: str,
@@ -30,7 +29,7 @@ class GSM8KParseEntry(HuggingFaceParseEntry):
         task_name: str,
     ) -> "GSM8KParseEntry":
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -46,7 +45,6 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
     _data_source: ClassVar[str] = "openai/gsm8k"
     _task_names: ClassVar[list[str]] = ["main", "socratic"]
     _default_task: ClassVar[str] = "main"
-    _default_system_prompt: ClassVar[str] = GSM8K_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -69,10 +67,10 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
         # Extract solution (everything before '####')
         solution = raw_answer.split("####")[0].strip()
 
-        prompt = f"{self._system_prompt}\n\n{raw_question}"
+        question = str(raw_question)
 
         return GSM8KParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=str(numerical_answer),
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -145,7 +143,7 @@ if __name__ == "__main__":
     parser.parse()
 
     parsed_data = parser.get_parsed_data
-    pprint(parsed_data[0].prompt)
+    pprint(parsed_data[0].question)
     pprint(parsed_data[0].answer)
     pprint(parsed_data[0].raw_question)
     pprint(parsed_data[0].raw_answer)
llmdataparser/humaneval_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import HUMANEVAL_SYSTEM_PROMPT
 
 
 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -22,7 +21,7 @@ class HumanEvalParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         task_id: str,
@@ -35,7 +34,7 @@ class HumanEvalParseEntry(HuggingFaceParseEntry):
         if not entry_point:
             raise ValueError("Entry point cannot be empty")
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=answer,  # In HumanEval, the canonical solution is the raw answer
@@ -52,7 +51,6 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
     _data_source: ClassVar[str] = "openai/openai_humaneval"
     _default_task: ClassVar[str] = "openai_humaneval"
     _task_names: ClassVar[list[str]] = ["openai_humaneval"]
-    _default_system_prompt: ClassVar[str] = HUMANEVAL_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -64,14 +62,13 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
         entry_point = row["entry_point"]
         test = row["test"]
 
-
-        prompt = f"{self._system_prompt}\n\n{raw_question}"
+        question = str(raw_question)
 
         # Use task_name if provided, otherwise use default
         task = task_name or self._get_current_task(row)
 
         return HumanEvalParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             task_id=task_id,
@@ -151,7 +148,6 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
     _data_source: ClassVar[str] = "evalplus/humanevalplus"
     _default_task: ClassVar[str] = "default"
     _task_names: ClassVar[list[str]] = ["default"]
-    _default_system_prompt: ClassVar[str] = HUMANEVAL_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -163,14 +159,12 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
         entry_point = row["entry_point"]
         test = row["test"]
 
-
-        prompt = f"{self._system_prompt}\n\n{raw_question}"
-
+        question = str(raw_question)
         # Use task_name if provided, otherwise use default
         task = task_name or self._get_current_task(row)
 
         return HumanEvalParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             task_id=task_id,
@@ -264,7 +258,7 @@ if __name__ == "__main__":
     print("\nExample parsed entry:")
     print(f"Task ID: {example.task_id}")
     print(f"Entry Point: {example.entry_point}")
-    print(f"Prompt:\n{example.prompt}")
+    print(f"Question:\n{example.question}")
     print(f"Solution:\n{example.answer}")
 
     parser = HumanEvalDatasetPlusParser()
llmdataparser/ifeval_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import IFEVAL_SYSTEM_PROMPT  # You'll need to create this
 
 
 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -21,7 +20,7 @@ class IFEvalParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_answer: str,
@@ -31,7 +30,7 @@ class IFEvalParseEntry(HuggingFaceParseEntry):
         task_name: str,
     ) -> "IFEvalParseEntry":
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -48,7 +47,6 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
     _data_source: ClassVar[str] = "google/IFEval"
     _default_task: ClassVar[str] = "default"
     _task_names: ClassVar[list[str]] = ["default"]
-    _default_system_prompt: ClassVar[str] = IFEVAL_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -65,14 +63,13 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
             answer = ""
             raw_answer = ""
 
-
-        prompt = f"{self._system_prompt}\n\n{raw_question}"
+        question = str(raw_question)
 
         # Use task_name if provided, otherwise use default
         task = task_name or self._get_current_task(row)
 
         return IFEvalParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -162,6 +159,6 @@ if __name__ == "__main__":
     example = parsed_data[0]
     print("\nExample parsed entry:")
     print(f"Key: {example.key}")
-    print(f"Prompt: {example.prompt}")
+    print(f"Question: {example.question}")
     print(f"Instruction IDs: {example.instruction_id_list}")
     print(f"kwargs: {example.kwargs}")
llmdataparser/math_parser.py
CHANGED
@@ -20,7 +20,7 @@ class MATHParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_answer: str,
@@ -29,7 +29,7 @@ class MATHParseEntry(HuggingFaceParseEntry):
         solution: str,
     ) -> "MATHParseEntry":
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -54,9 +54,7 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
         "all",
     ]
     _default_task: ClassVar[str] = "all"
-    _default_system_prompt: ClassVar[str] = (
-        "Solve the following mathematics problem step by step:"
-    )
+
     _valid_levels: ClassVar[set[str]] = {
         f"Level {i}" for i in range(1, 6)
     }  # Levels 1-5 are valid
@@ -80,7 +78,7 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
             level = "Unknown"
 
         return MATHParseEntry.create(
-            prompt=f"{self._system_prompt}\n\n{row['problem']}",
+            question=str(row["problem"]),
            answer=row["solution"],
            raw_question=row["problem"],
            raw_answer=row["solution"],
@@ -187,5 +185,5 @@ if __name__ == "__main__":
     print("\nExample parsed entry:")
     print(f"Task: {example.task_name}")
     print(f"Level: {example.level}")
-    print(f"Question: {example.prompt}")
+    print(f"Question: {example.question}")
     print(f"Solution: {example.solution}")
llmdataparser/mbpp_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import MBPP_SYSTEM_PROMPT
 
 
 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -23,7 +22,7 @@ class MBPPParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         task_id: int,
@@ -37,7 +36,7 @@ class MBPPParseEntry(HuggingFaceParseEntry):
             raise ValueError("Task ID must be an integer")
 
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=answer,  # In MBPP, the code solution is the raw answer
@@ -56,7 +55,6 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
     _data_source: ClassVar[str] = "google-research-datasets/mbpp"
     _default_task: ClassVar[str] = "full"  # Can be 'full' or 'sanitized'
     _task_names: ClassVar[list[str]] = ["full", "sanitized"]
-    _default_system_prompt: ClassVar[str] = MBPP_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -69,15 +67,14 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
         test_setup_code = row.get("test_setup_code", "")
         challenge_test_list = row.get("challenge_test_list", [])
 
-
-        prompt = f"{self._system_prompt}\n\nTask: {raw_question}"
+        question = str(raw_question)
 
         # Use task_name if provided, otherwise use default
         task = task_name or self._get_current_task(row)
         source_file = row.get("source_file", "")
 
         return MBPPParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             task_id=task_id,
llmdataparser/mgsm_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import MGSM_SYSTEM_PROMPT
 
 
 @dataclass(frozen=True, kw_only=True, slots=True)
@@ -21,7 +20,7 @@ class MGSMParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_answer: str,
@@ -31,7 +30,7 @@ class MGSMParseEntry(HuggingFaceParseEntry):
         language: str,
     ) -> "MGSMParseEntry":
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -60,7 +59,6 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
         "th",
         "zh",
     ]
-    _default_system_prompt: ClassVar[str] = MGSM_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -73,7 +71,7 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
             task_name: Language code for the current task
 
         Returns:
-            MGSMParseEntry: Processed entry with prompt, answer, and metadata
+            MGSMParseEntry: Processed entry with question, answer, and metadata
         """
         task = task_name or self._get_current_task(row)
         raw_question = row["question"]
@@ -81,14 +79,13 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
         numerical_answer = row["answer_number"]
         equation_solution = row["equation_solution"]
 
-
-        prompt = f"{self._system_prompt}\n{raw_question}"
+        question = str(raw_question)
 
         # Use numerical answer as string for the answer field if no detailed answer is provided
         answer = raw_answer if raw_answer else str(numerical_answer)
 
         return MGSMParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -188,7 +185,7 @@ if __name__ == "__main__":
     parser.parse()
 
     parsed_data = parser.get_parsed_data
-    pprint(parsed_data[0].prompt)
+    pprint(parsed_data[0].question)
     pprint(parsed_data[0].answer)
     pprint(parsed_data[0].raw_question)
     pprint(parsed_data[0].numerical_answer)
llmdataparser/mmlu_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import MMLU_PRO_SYSTEM_PROMPT, MMLU_SYSTEM_PROMPT
 
 MMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
 MMLU_PRO_VALID_ANSWERS: Final[set[str]] = {
@@ -36,7 +35,7 @@ class MMLUParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_choices: list[str],
@@ -50,7 +49,7 @@ class MMLUParseEntry(HuggingFaceParseEntry):
         if not task_name:
             raise ValueError("Task name cannot be empty")
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -69,7 +68,7 @@ class MMLUProParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_choices: list[str],
@@ -83,7 +82,7 @@ class MMLUProParseEntry(HuggingFaceParseEntry):
         if not task_name:
             raise ValueError("Task name cannot be empty")
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_choices=raw_choices,
@@ -95,8 +94,6 @@ class MMLUProParseEntry(HuggingFaceParseEntry):
 class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):
     """Base class for MMLU dataset parsers with common functionality."""
 
-    _default_system_prompt = MMLU_SYSTEM_PROMPT
-
     def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
         """Get the task name from the data entry or default task name."""
         task_name: str = data_entry.get("subject", "")
@@ -106,7 +103,7 @@ class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
     ) -> MMLUParseEntry:
         """
-        Generate a prompt and expected answer from the given row.
+        Generate a question and expected answer from the given row.
 
         Args:
             row: A data point to be formatted.
@@ -127,11 +124,11 @@ class MMLUDatasetParser(HuggingFaceDatasetParser[MMLUParseEntry]):
         raw_choices = row["choices"]
         raw_answer = str(row["answer"])  # Ensure raw_answer is a string
 
-        prompt = f"{self._system_prompt}\nQuestion: {raw_question}\n{choices}\nAnswer:"
+        question = f"Question: {raw_question}\n{choices}\nAnswer:"
         answer_letter = chr(65 + int(raw_answer))  # Convert index to 'A', 'B', 'C', 'D'
 
         return MMLUParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=answer_letter,
             raw_question=raw_question,
             raw_choices=raw_choices,
@@ -482,11 +479,11 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
         raw_question = row["question"]
         raw_answer = row["answer"]
 
-        prompt = f"{self._system_prompt}\nQuestion: {raw_question}\n{choices}\nAnswer:"
+        question = f"Question: {raw_question}\n{choices}\nAnswer:"
         task = task_name or self._get_current_task(row)
 
         return MMLUParseEntry.create(
-            prompt, raw_answer, raw_question, raw_choices, raw_answer, task
+            question, raw_answer, raw_question, raw_choices, raw_answer, task
         )
 
     def get_dataset_description(self) -> DatasetDescription:
@@ -572,7 +569,6 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
         "computer_science",
         "history",
     ]
-    _default_system_prompt = MMLU_PRO_SYSTEM_PROMPT
 
     def _get_task_from_entry(self, data_entry: dict[str, Any]) -> str:
         """Get the task name from the data entry or default task name."""
@@ -586,7 +582,7 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
     ) -> MMLUProParseEntry:
         """
-        Generate a prompt and expected answer from the given row.
+        Generate a question and expected answer from the given row.
 
         Args:
             row (dict[str, Any]): A data point to be formatted with MMLU Pro specific structure
@@ -608,13 +604,13 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
         raw_answer = row["answer"]
         answer_index = row["answer_index"]
 
-        prompt = f"{self._system_prompt}\nQuestion: {raw_question}\n{choices}\nAnswer:"
+        question = f"Question: {raw_question}\n{choices}\nAnswer:"
         answer_letter = chr(
             65 + answer_index
         )  # Convert index to 'A', 'B', 'C', 'D', etc.
 
         return MMLUProParseEntry.create(
-            prompt, answer_letter, raw_question, raw_choices, raw_answer, final_task
+            question, answer_letter, raw_question, raw_choices, raw_answer, final_task
        )
 
     def get_dataset_description(self) -> DatasetDescription:
llmdataparser/prompts.py
CHANGED
@@ -3,164 +3,65 @@ from typing import Final
 
 MMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-    You are
-
-    Instructions:
-    1. Carefully analyze the question and all answer options
-    2. Consider only verified, factual information
-    3. Select the most precise and accurate option
-    4. Respond with ONLY the letter (A, B, C, or D) - no explanations or additional text
+    You are an expert answering multiple-choice questions. Select the single most accurate answer (A, B, C, or D) based on factual knowledge. Respond with the letter only.
     """
 )
 
 MMLU_PRO_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-    You are
-
-    Instructions:
-    1. Carefully analyze the question and all answer options
-    2. Consider only verified, factual information
-    3. Select the most precise and accurate option
-    4. Respond with ONLY the letter (A through J) - no explanations or additional text
+    You are an expert answering multiple-choice questions. Select the single most accurate answer (A through J) based on factual knowledge. Respond with the letter only.
     """
 )
 
 GSM8K_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-
-    1. Read the problem carefully
-    2. Show your step-by-step reasoning
-    3. Ensure each step is clear and mathematically sound
-    4. End with the final numerical answer
-    5. Format your response as:
-    Let's solve this step by step:
-    1) [First step]
-    2) [Second step]
-    ...
-    Therefore, the answer is [number]
+    Solve this math problem step by step:
+    1) Show your reasoning
+    2) End with "Therefore, the answer is [number]"
     """
 )
 
-
 HUMANEVAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-    Instructions:
-    1. Read the function signature and docstring carefully
-    2. Implement only the function body, not the signature or docstring
-    3. Follow Python best practices and PEP 8 style guidelines
-    4. Write clear, readable code with appropriate variable names
-    5. Handle edge cases and input validation where necessary
-    6. Use type hints and ensure type safety
-    7. Optimize for both readability and performance
-    8. Add comments for complex logic or non-obvious implementations
-    9. Include appropriate error handling with specific exception types
-    10. Consider writing code that would be easy to test
-    11. Return only the implementation code, no additional text
-
-    Example of good implementation:
-    ```python
-    # Handle edge case of empty input
-    if not numbers:
-        raise ValueError("Input list cannot be empty")
-
-    # Use descriptive variable names and type hints
-    result: list[int] = sorted(numbers)
-    return result[len(result) // 2]  # Return median value
-    ```
+    Implement the Python function following best practices. Include error handling, type hints, and comments for complex logic. Return only the implementation code.
     """
 )
 
-MGSM_SYSTEM_PROMPT = textwrap.dedent(
+MGSM_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-
-
-    2. Show your step-by-step reasoning
-    3. Ensure each step is clear and mathematically sound
-    4. Use appropriate number formatting for the target language (e.g., decimal points vs. commas)
-    5. End with the final numerical answer
-    6. Format your response as:
-    Let's solve this step by step:
-    1) [First step]
-    2) [Second step]
-    ...
-    Therefore, the answer is [number]
+    Solve this math problem step by step in the specified language:
+    1) Show your reasoning
+    2) Use appropriate number formatting
+    3) End with "Therefore, the answer is [number]"
     """
 )
 
-
 IFEVAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-    Instructions:
-    1. Read all requirements carefully
-    2. Follow formatting rules exactly
-    3. Meet all length requirements
-    4. Include all required elements
-    5. Avoid forbidden elements
-    6. Provide ONLY the requested output
+    Follow the given requirements exactly. Provide only the requested output.
     """
 )
 
 BBH_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-    Instructions:
-    1. Read the entire problem carefully, including all given conditions and rules
-    2. Pay attention to the specific type of reasoning required (logical, temporal, spatial, etc.)
-    3. Consider all relationships and constraints mentioned in the problem
-    4. Apply structured thinking to reach a valid conclusion
-    5. Choose the answer that logically follows from the given information
-    6. Respond with ONLY the letter (A, B, C, etc.) or "True"/"False" or "Yes"/"No" and so on - no explanations or additional text
+    Solve this reasoning problem and respond with only the answer (letter, True/False, or Yes/No).
     """
 )
 
 MBPP_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-    Instructions:
-    1. Read the task description carefully
-    2. Write a complete Python solution that solves the problem
-    3. Follow Python best practices and PEP 8 style guidelines
-    4. Write clear, readable code with descriptive variable names
-    5. Handle edge cases and input validation appropriately
-    6. Include docstrings or comments to explain complex logic
-    7. Focus on fundamental programming concepts and standard library usage
-    8. Optimize for readability and maintainability
-    9. Return only the implementation code, no additional text
+    Write clean, efficient Python code that solves the given task. Include docstrings and handle edge cases. Return only the implementation code.
     """
 )
 
 TW_LEGAL_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-    Instructions:
-    1. Carefully analyze the legal question and all options
-    2. Consider Taiwan's specific legal context and terminology
-    3. Apply relevant laws, regulations, and legal principles
-    4. Select the single most accurate answer
-    5. Respond with ONLY the letter (A, B, C, or D) - no explanations or additional text
+    As a Taiwan legal expert, select the most accurate answer (A, B, C, or D) based on Taiwan's laws. Respond with the letter only.
     """
 )
 
 TMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
     """\
-
-
-    Instructions:
-    1. Carefully read and understand the question
-    2. Consider all answer options thoroughly
-    3. Apply subject-specific knowledge and reasoning
-    4. Select the single most accurate answer
-    5. Respond with ONLY the letter (A, B, C, or D) - no explanations or additional text
+    Select the most accurate answer (A, B, C, or D) based on Taiwan's educational and professional knowledge. Respond with the letter only.
     """
 )
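Since the parsers no longer prepend these constants, downstream code combines them with `entry.question` itself. A minimal sketch assuming an OpenAI-style chat message list; the helper below is illustrative and not part of the repo:

```python
import textwrap
from typing import Final

# Illustrative copy of one trimmed-down constant in the style of the new prompts.py.
MMLU_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
    """\
    You are an expert answering multiple-choice questions. Select the single most
    accurate answer (A, B, C, or D) based on factual knowledge. Respond with the letter only.
    """
)

def build_messages(question: str) -> list[dict[str, str]]:
    """Attach the system prompt explicitly, since the parsers no longer do it."""
    return [
        {"role": "system", "content": MMLU_SYSTEM_PROMPT.strip()},
        {"role": "user", "content": question},
    ]
```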
llmdataparser/tmlu_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import TMLU_SYSTEM_PROMPT
 
 TMLU_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
 TMLU_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(TMLU_VALID_ANSWERS))
@@ -24,7 +23,7 @@ class TMLUParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_choices: list[str],
@@ -38,7 +37,7 @@ class TMLUParseEntry(HuggingFaceParseEntry):
                 f"Invalid answer_letter '{answer}'; must be one of {TMLU_VALID_ANSWER_STR}"
             )
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -93,7 +92,6 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
         "teacher_qualification",
         "accountant",
     ]
-    _default_system_prompt = TMLU_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -110,10 +108,10 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
         explanation = row.get("explanation", "")
         metadata = row.get("metadata", {})
 
-        prompt = f"{self._system_prompt}\nQuestion: {raw_question}\n{choices}\nAnswer:"
+        question = f"Question: {raw_question}\n{choices}\nAnswer:"
 
         return TMLUParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=raw_answer,
             raw_question=raw_question,
             raw_choices=raw_choices,
@@ -187,7 +185,7 @@ if __name__ == "__main__":
     example = parsed_data[0]
     print("\nExample parsed entry:")
     print(f"Task: {example.task_name}")
-    print(f"Question: {example.prompt}")
+    print(f"Question: {example.question}")
     print("Choices:")
     for i, choice in enumerate(example.raw_choices):
         print(f"{chr(65 + i)}. {choice}")
llmdataparser/tw_legal_parser.py
CHANGED
@@ -7,7 +7,6 @@ from llmdataparser.base_parser import (
     HuggingFaceDatasetParser,
     HuggingFaceParseEntry,
 )
-from llmdataparser.prompts import TW_LEGAL_SYSTEM_PROMPT
 
 TW_LEGAL_VALID_ANSWERS: Final[set[str]] = {"A", "B", "C", "D"}
 TW_LEGAL_VALID_ANSWER_STR: Final[str] = ", ".join(sorted(TW_LEGAL_VALID_ANSWERS))
@@ -22,7 +21,7 @@ class TWLegalParseEntry(HuggingFaceParseEntry):
     @classmethod
     def create(
         cls,
-        prompt: str,
+        question: str,
         answer: str,
         raw_question: str,
         raw_choices: list[str],
@@ -34,7 +33,7 @@ class TWLegalParseEntry(HuggingFaceParseEntry):
                 f"Invalid answer_letter '{answer}'; must be one of {TW_LEGAL_VALID_ANSWER_STR}"
             )
         return cls(
-            prompt=prompt,
+            question=question,
             answer=answer,
             raw_question=raw_question,
             raw_answer=raw_answer,
@@ -49,7 +48,6 @@ class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
     _data_source = "lianghsun/tw-legal-benchmark-v1"
     _default_task = "default"
     _task_names = ["default"]
-    _default_system_prompt = TW_LEGAL_SYSTEM_PROMPT
 
     def process_entry(
         self, row: dict[str, Any], task_name: str | None = None, **kwargs: Any
@@ -64,10 +62,10 @@ class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
         raw_question = row["question"]
         raw_answer = row["answer"]
 
-        prompt = f"{self._system_prompt}\nQuestion: {raw_question}\n{choices}\nAnswer:"
+        question = f"Question: {raw_question}\n{choices}\nAnswer:"
 
         return TWLegalParseEntry.create(
-            prompt=prompt,
+            question=question,
             answer=raw_answer,
             raw_question=raw_question,
             raw_choices=raw_choices,
@@ -119,7 +117,7 @@ if __name__ == "__main__":
     if parsed_data:
         example = parsed_data[0]
         print("\nExample parsed entry:")
-        print(f"Question: {example.prompt}")
+        print(f"Question: {example.question}")
         print("Choices:")
         for i, choice in enumerate(example.raw_choices):
             print(f"{chr(65 + i)}. {choice}")
tests/test_bbh_parser.py
CHANGED
@@ -28,14 +28,14 @@ def sample_row():
 def test_bbh_parse_entry_creation_valid():
     """Test valid creation of BBHParseEntry."""
     entry = BBHParseEntry.create(
-        prompt="Test question",
+        question="Test question",
         answer="A",
         raw_question="Test question",
         raw_answer="(A)",
         task_name="reasoning_about_colored_objects",
     )
     assert isinstance(entry, BBHParseEntry)
-    assert entry.prompt == "Test question"
+    assert entry.question == "Test question"
     assert entry.answer == "A"
     assert entry.raw_question == "Test question"
     assert entry.raw_answer == "(A)"
@@ -76,7 +76,6 @@ def test_full_parse_workflow(loaded_bbh_parser):
     assert isinstance(first_entry, BBHParseEntry)
     assert first_entry.task_name == "reasoning_about_colored_objects"
     assert first_entry.answer.strip("()").isalpha()  # Should be a single letter
-    assert first_entry.prompt.startswith(loaded_bbh_parser._system_prompt)
 
 
 def test_process_entry(bbh_parser, sample_row):
@@ -87,9 +86,8 @@ def test_process_entry(bbh_parser, sample_row):
 
     assert isinstance(entry, BBHParseEntry)
     assert entry.answer == "A"  # Stripped from "(A)"
-    assert "What color is the sky" in entry.prompt
+    assert "What color is the sky" in entry.question
     assert entry.raw_answer == "(A)"
-    assert bbh_parser._system_prompt in entry.prompt
     assert entry.task_name == "reasoning_about_colored_objects"
 
 
tests/test_gsm8k_parser.py
CHANGED
@@ -30,7 +30,7 @@ def sample_row():
 def test_gsm8k_parse_entry_creation_valid():
     """Test valid creation of GSM8KParseEntry."""
     entry = GSM8KParseEntry.create(
-        prompt="Test question",
+        question="Test question",
         answer="5",
         raw_question="Test question",
         raw_answer="Solution steps #### 5",
@@ -39,7 +39,7 @@ def test_gsm8k_parse_entry_creation_valid():
         numerical_answer=5,
     )
     assert isinstance(entry, GSM8KParseEntry)
-    assert entry.prompt == "Test question"
+    assert entry.question == "Test question"
     assert entry.answer == "5"
     assert entry.solution == "Solution steps"
     assert entry.numerical_answer == 5
@@ -83,7 +83,6 @@ def test_full_parse_workflow(loaded_gsm8k_parser):
     assert isinstance(first_entry.numerical_answer, (str, int, float))
     assert "####" in first_entry.raw_answer
     assert first_entry.solution
-    assert first_entry.prompt.startswith(loaded_gsm8k_parser._system_prompt)
 
 
 def test_process_entry(gsm8k_parser, sample_row):
@@ -95,7 +94,6 @@ def test_process_entry(gsm8k_parser, sample_row):
     assert "Janet has 3 apples" in entry.raw_question
     assert "#### 5" in entry.raw_answer
     assert "Let's solve this step by step:" in entry.solution
-    assert gsm8k_parser._system_prompt in entry.prompt
     assert entry.task_name == "main"
 
 
tests/test_humaneval_parser.py
CHANGED
@@ -42,7 +42,7 @@ def plus_sample_entry():
 def test_humaneval_parse_entry_creation():
     """Test creation of HumanEvalParseEntry"""
     entry = HumanEvalParseEntry.create(
-
+        question="test question",
         answer="test answer",
         raw_question="raw question",
         task_id="HumanEval/1",
@@ -51,7 +51,7 @@ def test_humaneval_parse_entry_creation():
         task_name="openai_humaneval",
     )

-    assert entry.
+    assert entry.question == "test question"
     assert entry.answer == "test answer"
     assert entry.raw_question == "raw question"
     assert entry.raw_answer == "test answer"  # Should match answer
@@ -65,7 +65,7 @@ def test_humaneval_parse_entry_validation():
     """Test validation of required fields"""
     with pytest.raises(ValueError, match="Task ID cannot be empty"):
         HumanEvalParseEntry.create(
-
+            question="test",
             answer="test",
             raw_question="test",
             task_id="",  # Empty task_id should raise error
@@ -76,7 +76,7 @@ def test_humaneval_parse_entry_validation():

     with pytest.raises(ValueError, match="Entry point cannot be empty"):
         HumanEvalParseEntry.create(
-
+            question="test",
             answer="test",
             raw_question="test",
             task_id="test",
@@ -93,9 +93,7 @@ def test_process_entry(parser, sample_entry):
     assert isinstance(result, HumanEvalParseEntry)
     assert result.task_id == "HumanEval/0"
     assert result.entry_point == "add"
-
-        result.prompt == f"{parser._default_system_prompt}\n\n{sample_entry['prompt']}"
-    )
+
     assert result.answer == sample_entry["canonical_solution"]
     assert result.test == sample_entry["test"]
     assert result.task_name == "openai_humaneval"
@@ -147,10 +145,7 @@ def test_plus_process_entry(plus_parser, plus_sample_entry):
     assert isinstance(result, HumanEvalParseEntry)
     assert result.task_id == "HumanEval/0"
     assert result.entry_point == "add"
-
-        result.prompt
-        == f"{plus_parser._default_system_prompt}\n\n{plus_sample_entry['prompt']}"
-    )
+
     assert result.answer == plus_sample_entry["canonical_solution"]
     assert result.test == plus_sample_entry["test"]
     assert result.task_name == "default"
@@ -191,7 +186,7 @@ def test_get_dataset_description(parser, plus_parser):
     assert "evalplus" in plus_description.citation


-def test_get_evaluation_metrics(parser
+def test_get_evaluation_metrics(parser):
     """Test evaluation metrics generation for both parsers."""
     # Test original HumanEval metrics
     metrics = parser.get_evaluation_metrics()
tests/test_ifeval_parser.py
CHANGED
@@ -31,7 +31,7 @@ def ifeval_parser():
 def test_ifeval_parse_entry_creation_valid():
     """Test valid creation of IFEvalParseEntry."""
     entry = IFEvalParseEntry.create(
-
+        question="Test instruction",
         answer="",  # IFEval doesn't have answers
         raw_question="Test instruction",
         raw_answer="",
@@ -42,7 +42,7 @@ def test_ifeval_parse_entry_creation_valid():
     )

     assert isinstance(entry, IFEvalParseEntry)
-    assert entry.
+    assert entry.question == "Test instruction"
     assert entry.answer == ""
     assert entry.key == 1
     assert entry.instruction_id_list == ["test_001", "test_002"]
tests/test_math_parser.py
CHANGED
@@ -44,7 +44,7 @@ def sample_math_entries():
 def test_math_parse_entry_creation_valid():
     """Test valid creation of MATHParseEntry with all fields."""
     entry = MATHParseEntry.create(
-
+        question="Test question",
         answer="Test answer",
         raw_question="Test question",
         raw_answer="Test solution",
@@ -54,7 +54,7 @@ def test_math_parse_entry_creation_valid():
     )

     assert isinstance(entry, MATHParseEntry)
-    assert entry.
+    assert entry.question == "Test question"
     assert entry.answer == "Test answer"
     assert entry.raw_question == "Test question"
     assert entry.raw_answer == "Test solution"
@@ -85,9 +85,7 @@ def test_process_entry(math_parser, test_case):
     entry = math_parser.process_entry(test_case, task_name=test_case["type"])

     assert isinstance(entry, MATHParseEntry)
-
-        entry.prompt == f"{math_parser._default_system_prompt}\n{test_case['problem']}"
-    )
+
     assert entry.answer == test_case["solution"]
     assert entry.raw_question == test_case["problem"]
     assert entry.raw_answer == test_case["solution"]
@@ -108,7 +106,6 @@ def test_math_parser_initialization(math_parser):
         math_parser.get_huggingface_link
         == "https://huggingface.co/datasets/lighteval/MATH"
     )
-    assert "mathematics problem" in math_parser._default_system_prompt.lower()


 def test_get_current_task(math_parser):
tests/test_mbpp_parser.py
CHANGED
@@ -23,7 +23,7 @@ def parser():
 def test_mbpp_parse_entry_creation():
     """Test creation of MBPPParseEntry"""
     entry = MBPPParseEntry.create(
-
+        question="test question",
         answer="test answer",
         raw_question="raw question",
         task_id=42,
@@ -34,7 +34,7 @@ def test_mbpp_parse_entry_creation():
         source_file="test.pdf",
     )

-    assert entry.
+    assert entry.question == "test question"
     assert entry.answer == "test answer"
     assert entry.raw_question == "raw question"
     assert entry.raw_answer == "test answer"
@@ -49,7 +49,7 @@ def test_mbpp_parse_entry_validation():
     """Test validation of required fields"""
     with pytest.raises(ValueError, match="Task ID must be an integer"):
         MBPPParseEntry.create(
-
+            question="test",
             answer="test",
             raw_question="test",
             task_id="not_an_int",  # Invalid task_id type
@@ -71,8 +71,6 @@ def test_process_entry(parser, sample_entry):
     assert result.answer == sample_entry["code"]
     assert result.test_list == sample_entry["test_list"]
     assert result.challenge_test_list == sample_entry["challenge_test_list"]
-    expected_prompt = f"{parser._system_prompt}\n\nTask: {sample_entry['text']}"
-    assert result.prompt == expected_prompt
     assert result.task_name == "full"


@@ -142,18 +140,6 @@ def test_full_workflow_with_different_splits(parser):
     assert all(entry.task_name == "full" for entry in train_data)


-def test_custom_system_prompt():
-    """Test parser initialization with custom system prompt"""
-    custom_prompt = "Custom system prompt"
-    parser = MBPPDatasetParser(system_prompt=custom_prompt)
-    assert parser._system_prompt == custom_prompt
-
-
-def test_default_system_prompt(parser):
-    """Test parser uses default system prompt when none provided"""
-    assert parser._system_prompt == parser._default_system_prompt
-
-
 def test_get_dataset_description(parser):
     """Test dataset description generation."""
     description = parser.get_dataset_description()
tests/test_mgsm_parser.py
CHANGED
@@ -47,7 +47,7 @@ def sample_mgsm_entries():
 def test_mgsm_parse_entry_creation_valid():
     """Test valid creation of MGSMParseEntry with all fields."""
     entry = MGSMParseEntry.create(
-
+        question="Test question",
         answer="Test answer",
         raw_question="Test question",
         raw_answer="Test answer",
@@ -58,7 +58,7 @@ def test_mgsm_parse_entry_creation_valid():
     )

     assert isinstance(entry, MGSMParseEntry)
-    assert entry.
+    assert entry.question == "Test question"
     assert entry.answer == "Test answer"
     assert entry.raw_question == "Test question"
     assert entry.raw_answer == "Test answer"
@@ -168,22 +168,6 @@ def test_supported_languages(mgsm_parser, language):
     assert entry.numerical_answer == 42


-def test_system_prompt_override(mgsm_parser):
-    """Test overriding the default system prompt."""
-    custom_prompt = "Custom system prompt for testing"
-    parser = MGSMDatasetParser(system_prompt=custom_prompt)
-
-    test_entry = {
-        "question": "Test question",
-        "answer": "Test answer",
-        "answer_number": 42,
-        "equation_solution": "42",
-    }
-
-    entry = parser.process_entry(test_entry, task_name="en")
-    assert custom_prompt in entry.prompt
-
-
 def test_get_dataset_description(mgsm_parser):
     """Test dataset description generation."""
     description = mgsm_parser.get_dataset_description()
tests/test_mmlu_parser.py
CHANGED
@@ -70,7 +70,7 @@ def sample_mmlu_pro_entries():
 def test_mmlu_parse_entry_creation_valid():
     """Test valid creation of MMLUParseEntry."""
     entry = MMLUParseEntry.create(
-
+        question="Test question",
         answer="A",
         raw_question="Test question",
         raw_choices=["choice1", "choice2", "choice3", "choice4"],
@@ -78,7 +78,7 @@ def test_mmlu_parse_entry_creation_valid():
         task_name="test_task",
     )
     assert isinstance(entry, MMLUParseEntry)
-    assert entry.
+    assert entry.question == "Test question"
     assert entry.answer == "A"
     assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
     assert entry.task_name == "test_task"
@@ -91,7 +91,7 @@ def test_mmlu_parse_entry_creation_invalid(invalid_answer):
         ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
     ):
         MMLUParseEntry.create(
-
+            question="Test question",
             answer=invalid_answer,
             raw_question="Test question",
             raw_choices=["choice1", "choice2", "choice3", "choice4"],
@@ -106,10 +106,10 @@ def test_process_entry_base(base_parser, sample_mmlu_entries):

     assert isinstance(entry, MMLUParseEntry)
     assert entry.answer == "B"  # Index 1 maps to B
-    assert "A. London" in entry.
-    assert "B. Paris" in entry.
-    assert "C. Berlin" in entry.
-    assert "D. Madrid" in entry.
+    assert "A. London" in entry.question
+    assert "B. Paris" in entry.question
+    assert "C. Berlin" in entry.question
+    assert "D. Madrid" in entry.question
     assert entry.raw_question == "What is the capital of France?"
     assert entry.raw_choices == ["London", "Paris", "Berlin", "Madrid"]
     assert entry.raw_answer == "1"
@@ -119,7 +119,7 @@ def test_process_entry_base(base_parser, sample_mmlu_entries):
 def test_mmlu_pro_parse_entry_creation_valid():
     """Test valid creation of MMLUProParseEntry."""
     entry = MMLUProParseEntry.create(
-
+        question="Test question",
         answer="E",  # MMLU Pro supports up to J
         raw_question="Test question",
         raw_choices=["choice1", "choice2", "choice3", "choice4", "choice5"],
@@ -139,7 +139,7 @@ def test_process_entry_mmlu_pro(mmlu_pro_parser, sample_mmlu_pro_entries):

     assert isinstance(entry, MMLUProParseEntry)
     assert entry.answer == "B"  # Index 1 maps to B
-    assert "O(n log n)" in entry.
+    assert "O(n log n)" in entry.question
     assert entry.task_name == "computer_science"
     assert len(entry.raw_choices) == 6
tests/test_tmlu_parser.py
CHANGED
@@ -47,7 +47,7 @@ def sample_tmlu_entries():
 def test_tmlu_parse_entry_creation_valid():
     """Test valid creation of TMLUParseEntry."""
     entry = TMLUParseEntry.create(
-
+        question="Test question",
         answer="A",
         raw_question="Test question",
         raw_choices=["choice1", "choice2", "choice3", "choice4"],
@@ -57,7 +57,7 @@ def test_tmlu_parse_entry_creation_valid():
         metadata={"source": "test"},
     )
     assert isinstance(entry, TMLUParseEntry)
-    assert entry.
+    assert entry.question == "Test question"
     assert entry.answer == "A"
     assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]
     assert entry.explanation == "Test explanation"
@@ -71,7 +71,7 @@ def test_tmlu_parse_entry_creation_invalid(invalid_answer):
         ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
     ):
         TMLUParseEntry.create(
-
+            question="Test question",
             answer=invalid_answer,
             raw_question="Test question",
             raw_choices=["choice1", "choice2", "choice3", "choice4"],
@@ -140,26 +140,6 @@ def test_different_tasks_parsing(tmlu_parser):
     assert math_count > 0


-def test_system_prompt_override(tmlu_parser):
-    """Test overriding the default system prompt."""
-    custom_prompt = "Custom system prompt for testing"
-    parser = TMLUDatasetParser(system_prompt=custom_prompt)
-
-    test_entry = {
-        "question": "Test question",
-        "A": "Choice A",
-        "B": "Choice B",
-        "C": "Choice C",
-        "D": "Choice D",
-        "answer": "A",
-        "explanation": "Test explanation",
-        "metadata": {"source": "test"},
-    }
-
-    entry = parser.process_entry(test_entry)
-    assert custom_prompt in entry.prompt
-
-
 def test_metadata_handling(tmlu_parser, sample_tmlu_entries):
     """Test proper handling of metadata in entries."""
     entry = tmlu_parser.process_entry(sample_tmlu_entries[0])
tests/test_tw_legal_parser.py
CHANGED
@@ -35,7 +35,7 @@ def sample_tw_legal_entries():
 def test_tw_legal_parse_entry_creation_valid():
     """Test valid creation of TWLegalParseEntry."""
     entry = TWLegalParseEntry.create(
-
+        question="Test question",
         answer="A",
         raw_question="Test question",
         raw_choices=["choice1", "choice2", "choice3", "choice4"],
@@ -43,7 +43,7 @@ def test_tw_legal_parse_entry_creation_valid():
         task_name="default",
     )
     assert isinstance(entry, TWLegalParseEntry)
-    assert entry.
+    assert entry.question == "Test question"
     assert entry.answer == "A"
     assert entry.raw_choices == ["choice1", "choice2", "choice3", "choice4"]

@@ -55,7 +55,7 @@ def test_tw_legal_parse_entry_creation_invalid(invalid_answer):
         ValueError, match="Invalid answer_letter.*must be one of A, B, C, D"
     ):
         TWLegalParseEntry.create(
-
+            question="Test question",
             answer=invalid_answer,
             raw_question="Test question",
             raw_choices=["choice1", "choice2", "choice3", "choice4"],
@@ -70,10 +70,10 @@ def test_process_entry(tw_legal_parser, sample_tw_legal_entries):

     assert isinstance(entry, TWLegalParseEntry)
     assert entry.answer == "D"
-    assert "A. 法人於法令限制內,有享受權利負擔義務之能力" in entry.
-    assert "B. 法人因目的之達到而消滅" in entry.
-    assert "C. 法人非依法律之規定,不得成立" in entry.
-    assert "D. 法人於登記前,即取得權利能力" in entry.
+    assert "A. 法人於法令限制內,有享受權利負擔義務之能力" in entry.question
+    assert "B. 法人因目的之達到而消滅" in entry.question
+    assert "C. 法人非依法律之規定,不得成立" in entry.question
+    assert "D. 法人於登記前,即取得權利能力" in entry.question
     assert entry.raw_question == "依民法規定,下列關於法人之敘述,何者錯誤?"
     assert len(entry.raw_choices) == 4

@@ -122,24 +122,6 @@ def test_data_parsing(tw_legal_parser):
     assert all(entry.answer in {"A", "B", "C", "D"} for entry in parsed_data)


-def test_system_prompt_override(tw_legal_parser):
-    """Test overriding the default system prompt."""
-    custom_prompt = "Custom system prompt for testing"
-    parser = TWLegalDatasetParser(system_prompt=custom_prompt)
-
-    test_entry = {
-        "question": "Test question",
-        "A": "Choice A",
-        "B": "Choice B",
-        "C": "Choice C",
-        "D": "Choice D",
-        "answer": "A",
-    }
-
-    entry = parser.process_entry(test_entry)
-    assert custom_prompt in entry.prompt
-
-
 def test_get_dataset_description(tw_legal_parser):
     """Test getting dataset description for Taiwan Legal parser."""
     description = tw_legal_parser.get_dataset_description()
|