JeffYang52415 committed

refactor: add category to parser

Files changed:
- app.py                              +18 -2
- llmdataparser/base_parser.py        +20 -0
- llmdataparser/bbh_parser.py          +1 -0
- llmdataparser/gsm8k_parser.py        +1 -0
- llmdataparser/humaneval_parser.py    +2 -0
- llmdataparser/ifeval_parser.py       +1 -0
- llmdataparser/math_parser.py         +1 -0
- llmdataparser/mbpp_parser.py         +1 -0
- llmdataparser/mgsm_parser.py         +1 -0
- llmdataparser/mmlu_parser.py         +4 -0
- llmdataparser/tmlu_parser.py         +1 -0
- llmdataparser/tw_legal_parser.py     +1 -0
app.py
CHANGED

@@ -252,8 +252,24 @@ def update_metric_details(metric_name: str, parser_name: str) -> str:
 
 def create_interface() -> gr.Blocks:
     """Create and return the Gradio interface."""
-    with gr.Blocks() as demo:
-
+    with gr.Blocks(css="footer {display: none !important}") as demo:
+        # Add header section with purpose and GitHub info
+        gr.Markdown("""
+        # LLM Evaluation Dataset Parser
+
+        ### 🎯 Purpose
+        A unified interface for parsing and exploring various LLM benchmark datasets (MMLU, MMLU-Pro, GSM8k, and more).
+        This tool helps researchers and developers to:
+        - Easily explore different benchmark datasets
+        - Access standardized parsing for multiple dataset formats
+        - View dataset descriptions and evaluation metrics
+
+        ### 🔗 Links
+        - [GitHub Repository](https://github.com/jeff52415/LLMDataParser)
+        - [Documentation](https://github.com/jeff52415/LLMDataParser#readme)
+
+        ---
+        """)
 
         # State management
         parser_state = gr.State("")
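As an aside, the header pattern in this hunk is self-contained Gradio: custom CSS on the `Blocks` container hides the default footer, and a `gr.Markdown` call renders the banner. A minimal runnable sketch of just that pattern, independent of the rest of app.py:

# Standalone sketch of the Gradio pattern used in the hunk above.
import gradio as gr

with gr.Blocks(css="footer {display: none !important}") as demo:
    # gr.Markdown dedents and renders the string as markdown.
    gr.Markdown("""
    # LLM Evaluation Dataset Parser
    A unified interface for exploring LLM benchmark datasets.
    """)

if __name__ == "__main__":
    demo.launch()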
llmdataparser/base_parser.py
CHANGED

@@ -9,6 +9,18 @@ import datasets
 T = TypeVar("T", bound="ParseEntry")
 
 
+# Add this after the DatasetCategory definition
+VALID_CATEGORIES = {
+    "Math",
+    "General Knowledge and Reasoning",
+    "Programming",
+    "MultiLingual",
+    "Taiwan",
+    "Advanced Reasoning",
+    "Legal",
+}
+
+
 @dataclass(frozen=True, kw_only=True, slots=True)
 class ParseEntry:
     """A simple base class for entries, customizable by each dataset parser."""
@@ -28,6 +40,7 @@ class DatasetDescription:
     source: str
     language: str
     format: str
+    category: list[str]
     characteristics: str
     citation: str | None = None
     additional_info: dict[str, Any] | None = None
@@ -40,16 +53,23 @@ class DatasetDescription:
         source: str,
         language: str,
         format: str,
+        category: list[str],
         characteristics: str,
         citation: str | None = None,
         additional_info: dict[str, Any] | None = None,
     ) -> "DatasetDescription":
+        # Validate that all categories are valid DatasetCategory values
+        for item in category:
+            assert (
+                item in VALID_CATEGORIES
+            ), f"Category '{item}' is not a valid category. Valid categories are: {VALID_CATEGORIES}"
        return cls(
            name=name,
            purpose=purpose,
            source=source,
            language=language,
            format=format,
+           category=category,
            characteristics=characteristics,
            citation=citation,
            additional_info=additional_info,
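The net effect: every DatasetDescription now carries a `category` list that is checked against `VALID_CATEGORIES` at construction time. A minimal sketch of the behavior, assuming the factory classmethod shown in the last hunk is named `create` (its name falls outside the diff context) and using illustrative field values:

# Sketch of the new category validation; values are illustrative.
from llmdataparser.base_parser import DatasetDescription

# Valid categories pass straight through to the frozen dataclass:
description = DatasetDescription.create(
    name="GSM8K",
    purpose="Evaluate multi-step mathematical reasoning",
    source="OpenAI",
    language="English",
    format="Word problems with step-by-step solutions",
    category=["Math"],  # each item must appear in VALID_CATEGORIES
    characteristics="8.5K grade school math word problems",
)
print(description.category)  # ['Math']

# An unknown category fails fast at creation time:
try:
    DatasetDescription.create(
        name="Example",
        purpose="Illustrate validation",
        source="n/a",
        language="English",
        format="n/a",
        category=["Chemistry"],  # hypothetical, not in VALID_CATEGORIES
        characteristics="hypothetical entry",
    )
except AssertionError as exc:
    print(exc)  # Category 'Chemistry' is not a valid category. ...

One caveat: `assert` statements are stripped when Python runs with `-O`, so this check disappears under optimized runs; raising `ValueError` instead would make the validation unconditional.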
llmdataparser/bbh_parser.py
CHANGED

@@ -106,6 +106,7 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
                 "significantly improved through chain-of-thought prompting. The dataset "
                 "includes 23 core tasks plus additional related tasks."
             ),
+            category=["Advanced Reasoning"],
             citation=(
                 "@article{suzgun2022challenging,\n"
                 "    title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
llmdataparser/gsm8k_parser.py
CHANGED

@@ -89,6 +89,7 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
             source="OpenAI",
             language="English",
             format="Word problems with step-by-step solutions and numerical answers",
+            category=["Math"],
             characteristics=(
                 "Collection of 8.5K grade school math word problems that require "
                 "multi-step reasoning. Problems gradually increase in difficulty "
llmdataparser/humaneval_parser.py
CHANGED

@@ -88,6 +88,7 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
             source="OpenAI",
             language="Python",
             format="Function signatures with docstrings and unit tests",
+            category=["Programming"],
             characteristics=(
                 "Collection of 164 hand-written Python programming problems. Each problem "
                 "includes a function signature, docstring, example test cases, and hidden unit "
@@ -186,6 +187,7 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
             source="EvalPlus",
             language="Python",
             format="Function signatures with docstrings and comprehensive test suites",
+            category=["Programming"],
             characteristics=(
                 "Significantly enhanced version of HumanEval with 80x more test cases. "
                 "Includes extensive edge cases, boundary conditions, stress tests, and "
llmdataparser/ifeval_parser.py
CHANGED

@@ -90,6 +90,7 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
             source="Google Research",
             language="English (BCP-47 en)",
             format="Verifiable instruction prompts with automated evaluation criteria",
+            category=["Programming"],
             characteristics=(
                 "Collection of approximately 500 verifiable instructions designed to evaluate "
                 "language models' instruction-following capabilities. Instructions include "
llmdataparser/math_parser.py
CHANGED

@@ -97,6 +97,7 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
             source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
             language="English",
             format="Competition mathematics problems with step-by-step solutions",
+            category=["Math"],
             characteristics=(
                 "Collection of 12,500 challenging competition mathematics problems designed to "
                 "evaluate mathematical reasoning. Problems include step-by-step solutions that "
llmdataparser/mbpp_parser.py
CHANGED

@@ -95,6 +95,7 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
             purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
             source="https://github.com/google-research/google-research/tree/master/mbpp",
             language="English and Python",
+            category=["Programming"],
             format="Task descriptions in English with corresponding Python solutions and automated test cases",
             characteristics=(
                 "Contains approximately 1,000 crowd-sourced Python programming problems "
llmdataparser/mgsm_parser.py
CHANGED

@@ -106,6 +106,7 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
             source="https://huggingface.co/datasets/juletxara/mgsm",
             language="Multilingual (11 languages)",
             format="Word problems with numerical answers and solution steps",
+            category=["Math", "MultiLingual"],
             characteristics=(
                 "Human-translated version of 250 GSM8K problems into 10 additional languages. "
                 "Each problem includes the original question from GSM8K, its translations, "
llmdataparser/mmlu_parser.py
CHANGED

@@ -212,6 +212,7 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
             purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
             source="https://huggingface.co/datasets/cais/mmlu",
             language="English",
+            category=["General Knowledge and Reasoning"],
             format="Multiple choice questions with four options (A, B, C, D)",
             characteristics=(
                 "Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
@@ -332,6 +333,7 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
             source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
             language="English",
             format="Multiple choice questions with four options (A, B, C, D)",
+            category=["General Knowledge and Reasoning"],
             characteristics=(
                 "A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
                 "manually re-annotated to identify and classify various types of errors. "
@@ -494,6 +496,7 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
             purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
             source="https://huggingface.co/datasets/ikala/tmmluplus",
             language="Traditional Chinese",
+            category=["General Knowledge and Reasoning", "Taiwan"],
             format="Multiple choice questions with four options (A, B, C, D)",
             characteristics=(
                 "A comprehensive evaluation benchmark featuring 66 subjects from elementary "
@@ -621,6 +624,7 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
             purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
             source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
             language="English",
+            category=["General Knowledge and Reasoning", "Advanced Reasoning"],
             format="Multiple choice questions with up to 10 options (expanded from original 4)",
             characteristics=(
                 "A more challenging version of MMLU containing 12K complex questions across various "
llmdataparser/tmlu_parser.py
CHANGED

@@ -130,6 +130,7 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
             language="Traditional Chinese",
             purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
             source="Various Taiwan standardized tests and professional certifications",
+            category=["Taiwan", "General Knowledge and Reasoning"],
             format="Multiple choice questions (A/B/C/D)",
             characteristics=(
                 "Covers various subjects including Advanced Subjects Test (AST), "
llmdataparser/tw_legal_parser.py
CHANGED

@@ -82,6 +82,7 @@ class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
             language="Traditional Chinese",
             purpose="Evaluate models on Taiwan-specific legal knowledge and understanding",
             source="Taiwan Bar Examination questions",
+            category=["Taiwan", "General Knowledge and Reasoning", "Legal"],
             format="Multiple choice questions (A/B/C/D)",
             characteristics=(
                 "Contains questions from Taiwan's bar examination, testing understanding "