JeffYang52415 committed
Commit a06316f · unverified · 1 Parent(s): 37cb834

refactor: add category to parser

app.py CHANGED
@@ -252,8 +252,24 @@ def update_metric_details(metric_name: str, parser_name: str) -> str:
 
 def create_interface() -> gr.Blocks:
     """Create and return the Gradio interface."""
-    with gr.Blocks() as demo:
-        gr.Markdown("# LLM Evaluation Dataset Parser")
+    with gr.Blocks(css="footer {display: none !important}") as demo:
+        # Add header section with purpose and GitHub info
+        gr.Markdown("""
+        # LLM Evaluation Dataset Parser
+
+        ### 🎯 Purpose
+        A unified interface for parsing and exploring various LLM benchmark datasets (MMLU, MMLU-Pro, GSM8k, and more).
+        This tool helps researchers and developers to:
+        - Easily explore different benchmark datasets
+        - Access standardized parsing for multiple dataset formats
+        - View dataset descriptions and evaluation metrics
+
+        ### 🔗 Links
+        - [GitHub Repository](https://github.com/jeff52415/LLMDataParser)
+        - [Documentation](https://github.com/jeff52415/LLMDataParser#readme)
+
+        ---
+        """)
 
         # State management
         parser_state = gr.State("")
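
Side note on the `css` argument: `gr.Blocks` accepts page-level CSS, which is what hides Gradio's default footer here. A minimal standalone sketch of the same pattern (illustrative, not part of this commit):

import gradio as gr

# Page-level CSS passed to gr.Blocks; here it hides Gradio's default footer.
with gr.Blocks(css="footer {display: none !important}") as demo:
    gr.Markdown("# Demo")

if __name__ == "__main__":
    demo.launch()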
llmdataparser/base_parser.py CHANGED
@@ -9,6 +9,18 @@ import datasets
 T = TypeVar("T", bound="ParseEntry")
 
 
+# Valid values for the DatasetDescription.category field
+VALID_CATEGORIES = {
+    "Math",
+    "General Knowledge and Reasoning",
+    "Programming",
+    "MultiLingual",
+    "Taiwan",
+    "Advanced Reasoning",
+    "Legal",
+}
+
+
 @dataclass(frozen=True, kw_only=True, slots=True)
 class ParseEntry:
     """A simple base class for entries, customizable by each dataset parser."""
@@ -28,6 +40,7 @@ class DatasetDescription:
     source: str
     language: str
     format: str
+    category: list[str]
     characteristics: str
     citation: str | None = None
     additional_info: dict[str, Any] | None = None
@@ -40,16 +53,23 @@ class DatasetDescription:
         source: str,
         language: str,
         format: str,
+        category: list[str],
         characteristics: str,
         citation: str | None = None,
         additional_info: dict[str, Any] | None = None,
     ) -> "DatasetDescription":
+        # Validate that every entry is one of the valid categories
+        for item in category:
+            assert (
+                item in VALID_CATEGORIES
+            ), f"Category '{item}' is not a valid category. Valid categories are: {VALID_CATEGORIES}"
         return cls(
             name=name,
             purpose=purpose,
             source=source,
             language=language,
             format=format,
+            category=category,
             characteristics=characteristics,
             citation=citation,
             additional_info=additional_info,
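
For reference, the new `category` field is checked eagerly in `DatasetDescription.create`; a minimal sketch of the resulting behavior, assuming the imports resolve as in this diff:

from llmdataparser.base_parser import DatasetDescription

# A recognized category passes validation and is stored on the frozen dataclass.
desc = DatasetDescription.create(
    name="GSM8K",
    purpose="Evaluate multi-step mathematical reasoning",
    source="OpenAI",
    language="English",
    format="Word problems with numerical answers",
    category=["Math"],
    characteristics="8.5K grade school math word problems",
)
print(desc.category)  # ['Math']

# An unrecognized category trips the assert and raises AssertionError.
try:
    DatasetDescription.create(
        name="Example",
        purpose="Demo",
        source="n/a",
        language="English",
        format="n/a",
        category=["Chemistry"],  # not in VALID_CATEGORIES
        characteristics="Demo",
    )
except AssertionError as error:
    print(error)

One caveat: assert statements are stripped when Python runs with -O, so raising ValueError would make the check unconditional.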
llmdataparser/bbh_parser.py CHANGED
@@ -106,6 +106,7 @@ class BBHDatasetParser(HuggingFaceDatasetParser[BBHParseEntry]):
                 "significantly improved through chain-of-thought prompting. The dataset "
                 "includes 23 core tasks plus additional related tasks."
             ),
+            category=["Advanced Reasoning"],
             citation=(
                 "@article{suzgun2022challenging,\n"
                 "  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},\n"
llmdataparser/gsm8k_parser.py CHANGED
@@ -89,6 +89,7 @@ class GSM8KDatasetParser(HuggingFaceDatasetParser[GSM8KParseEntry]):
             source="OpenAI",
             language="English",
             format="Word problems with step-by-step solutions and numerical answers",
+            category=["Math"],
             characteristics=(
                 "Collection of 8.5K grade school math word problems that require "
                 "multi-step reasoning. Problems gradually increase in difficulty "
llmdataparser/humaneval_parser.py CHANGED
@@ -88,6 +88,7 @@ class HumanEvalDatasetParser(HuggingFaceDatasetParser[HumanEvalParseEntry]):
             source="OpenAI",
             language="Python",
             format="Function signatures with docstrings and unit tests",
+            category=["Programming"],
             characteristics=(
                 "Collection of 164 hand-written Python programming problems. Each problem "
                 "includes a function signature, docstring, example test cases, and hidden unit "
@@ -186,6 +187,7 @@ class HumanEvalDatasetPlusParser(HumanEvalDatasetParser):
             source="EvalPlus",
             language="Python",
             format="Function signatures with docstrings and comprehensive test suites",
+            category=["Programming"],
             characteristics=(
                 "Significantly enhanced version of HumanEval with 80x more test cases. "
                 "Includes extensive edge cases, boundary conditions, stress tests, and "
llmdataparser/ifeval_parser.py CHANGED
@@ -90,6 +90,7 @@ class IFEvalDatasetParser(HuggingFaceDatasetParser[IFEvalParseEntry]):
             source="Google Research",
             language="English (BCP-47 en)",
             format="Verifiable instruction prompts with automated evaluation criteria",
+            category=["Programming"],
             characteristics=(
                 "Collection of approximately 500 verifiable instructions designed to evaluate "
                 "language models' instruction-following capabilities. Instructions include "
llmdataparser/math_parser.py CHANGED
@@ -97,6 +97,7 @@ class MATHDatasetParser(HuggingFaceDatasetParser[MATHParseEntry]):
             source="Hendrycks et al., UC Berkeley (NeurIPS 2021)",
             language="English",
             format="Competition mathematics problems with step-by-step solutions",
+            category=["Math"],
             characteristics=(
                 "Collection of 12,500 challenging competition mathematics problems designed to "
                 "evaluate mathematical reasoning. Problems include step-by-step solutions that "
llmdataparser/mbpp_parser.py CHANGED
@@ -95,6 +95,7 @@ class MBPPDatasetParser(HuggingFaceDatasetParser[MBPPParseEntry]):
             purpose="A benchmark for evaluating code generation capabilities using entry-level Python programming problems",
             source="https://github.com/google-research/google-research/tree/master/mbpp",
             language="English and Python",
+            category=["Programming"],
             format="Task descriptions in English with corresponding Python solutions and automated test cases",
             characteristics=(
                 "Contains approximately 1,000 crowd-sourced Python programming problems "
llmdataparser/mgsm_parser.py CHANGED
@@ -106,6 +106,7 @@ class MGSMDatasetParser(HuggingFaceDatasetParser[MGSMParseEntry]):
             source="https://huggingface.co/datasets/juletxara/mgsm",
             language="Multilingual (11 languages)",
             format="Word problems with numerical answers and solution steps",
+            category=["Math", "MultiLingual"],
             characteristics=(
                 "Human-translated version of 250 GSM8K problems into 10 additional languages. "
                 "Each problem includes the original question from GSM8K, its translations, "
llmdataparser/mmlu_parser.py CHANGED
@@ -212,6 +212,7 @@ class BaseMMLUDatasetParser(MMLUDatasetParser):
             purpose="Evaluate models' extensive world knowledge and problem-solving abilities across diverse branches of knowledge",
             source="https://huggingface.co/datasets/cais/mmlu",
             language="English",
+            category=["General Knowledge and Reasoning"],
             format="Multiple choice questions with four options (A, B, C, D)",
             characteristics=(
                 "Comprehensive evaluation benchmark spanning humanities, social sciences, hard sciences, "
@@ -332,6 +333,7 @@ class MMLUReduxDatasetParser(MMLUDatasetParser):
             source="https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux",
             language="English",
             format="Multiple choice questions with four options (A, B, C, D)",
+            category=["General Knowledge and Reasoning"],
             characteristics=(
                 "A carefully curated subset of 3,000 questions across 30 MMLU subjects, "
                 "manually re-annotated to identify and classify various types of errors. "
@@ -494,6 +496,7 @@ class TMMLUPlusDatasetParser(MMLUDatasetParser):
             purpose="Evaluate language models' understanding and reasoning capabilities in Traditional Chinese across diverse subjects",
             source="https://huggingface.co/datasets/ikala/tmmluplus",
             language="Traditional Chinese",
+            category=["General Knowledge and Reasoning", "Taiwan"],
             format="Multiple choice questions with four options (A, B, C, D)",
             characteristics=(
                 "A comprehensive evaluation benchmark featuring 66 subjects from elementary "
@@ -621,6 +624,7 @@ class MMLUProDatasetParser(HuggingFaceDatasetParser[MMLUProParseEntry]):
             purpose="Provide a more robust and challenging multi-task language understanding benchmark with enhanced reasoning requirements",
             source="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
             language="English",
+            category=["General Knowledge and Reasoning", "Advanced Reasoning"],
             format="Multiple choice questions with up to 10 options (expanded from original 4)",
             characteristics=(
                 "A more challenging version of MMLU containing 12K complex questions across various "
llmdataparser/tmlu_parser.py CHANGED
@@ -130,6 +130,7 @@ class TMLUDatasetParser(HuggingFaceDatasetParser[TMLUParseEntry]):
             language="Traditional Chinese",
             purpose="Evaluate models on Taiwan-specific educational and professional knowledge",
             source="Various Taiwan standardized tests and professional certifications",
+            category=["Taiwan", "General Knowledge and Reasoning"],
             format="Multiple choice questions (A/B/C/D)",
             characteristics=(
                 "Covers various subjects including Advanced Subjects Test (AST), "
llmdataparser/tw_legal_parser.py CHANGED
@@ -82,6 +82,7 @@ class TWLegalDatasetParser(HuggingFaceDatasetParser[TWLegalParseEntry]):
             language="Traditional Chinese",
             purpose="Evaluate models on Taiwan-specific legal knowledge and understanding",
             source="Taiwan Bar Examination questions",
+            category=["Taiwan", "General Knowledge and Reasoning", "Legal"],
             format="Multiple choice questions (A/B/C/D)",
             characteristics=(
                 "Contains questions from Taiwan's bar examination, testing understanding "