JeffYang52415 committed
Commit 3d203ac · unverified · 1 Parent(s): 4f7957f

docs: add new parser

docs/adding_new_parser.md ADDED
@@ -0,0 +1,199 @@
# Adding a New Dataset Parser

This guide explains how to add a new dataset parser to the llmdataparser library. The library is designed to make it easy to add support for new datasets while maintaining consistent interfaces and functionality.

## Step-by-Step Guide

### 1. Create a New Parser Class

Create a new file `your_dataset_parser.py` in the `llmdataparser` folder. Your parser should inherit from `HuggingFaceDatasetParser[T]`, where `T` is your custom entry type.

```python
from dataclasses import dataclass

from llmdataparser.base_parser import (
    DatasetDescription,
    EvaluationMetric,
    HuggingFaceDatasetParser,
    HuggingFaceParseEntry,
)
from llmdataparser.prompts import YOUR_SYSTEM_PROMPT


@dataclass(frozen=True, kw_only=True, slots=True)
class YourDatasetParseEntry(HuggingFaceParseEntry):
    """Custom entry class for your dataset."""

    # Add any additional fields specific to your dataset
    custom_field: str

    @classmethod
    def create(cls, prompt: str, answer: str, raw_question: str,
               raw_answer: str, task_name: str, custom_field: str) -> "YourDatasetParseEntry":
        return cls(
            prompt=prompt,
            answer=answer,
            raw_question=raw_question,
            raw_answer=raw_answer,
            task_name=task_name,
            custom_field=custom_field,
        )


class YourDatasetParser(HuggingFaceDatasetParser[YourDatasetParseEntry]):
    """Parser for your dataset."""

    # Required class variables
    _data_source = "huggingface/your-dataset"
    _default_task = "default"
    _task_names = ["task1", "task2", "task3"]
    _default_system_prompt = YOUR_SYSTEM_PROMPT
```

### 2. Define System Prompt

Add your system prompt to `llmdataparser/prompts.py`:

```python
import textwrap
from typing import Final

YOUR_SYSTEM_PROMPT: Final[str] = textwrap.dedent(
    """\
    You are an expert in [your domain]. Your task is to [describe the task].

    Instructions:
    1. [First instruction]
    2. [Second instruction]
    ...
    """
)
```

### 3. Implement Required Methods

Your parser needs to implement these key methods:

```python
def process_entry(
    self,
    row: dict[str, Any],
    task_name: str | None = None,
    **kwargs: Any,
) -> YourDatasetParseEntry:
    """Process a single dataset entry."""
    # Extract data from the row
    raw_question = row["question"]
    raw_answer = row["answer"]
    task = task_name or self._get_current_task(row)

    # Format the prompt
    prompt = f"{self._system_prompt}\nQuestion: {raw_question}\nAnswer:"

    return YourDatasetParseEntry.create(
        prompt=prompt,
        answer=raw_answer,
        raw_question=raw_question,
        raw_answer=raw_answer,
        task_name=task,
        custom_field=row["custom_field"],
    )


def get_dataset_description(self) -> DatasetDescription:
    """Returns description of your dataset."""
    return DatasetDescription.create(
        name="Your Dataset Name",
        purpose="Purpose of the dataset",
        source="Dataset source/URL",
        language="Dataset language",
        format="Data format (e.g., multiple choice, free text)",
        characteristics="Key characteristics of the dataset",
        citation="Dataset citation if available",
    )


def get_evaluation_metrics(self) -> list[EvaluationMetric]:
    """Returns recommended evaluation metrics."""
    return [
        EvaluationMetric.create(
            name="metric_name",
            type="metric_type",
            description="Metric description",
            implementation="implementation_details",
            primary=True,
        )
    ]
```

### 4. Add Example Usage

Add example usage at the bottom of your parser file:

```python
if __name__ == "__main__":
    # Example usage
    parser = YourDatasetParser()
    parser.load()
    parser.parse()

    # Get parsed data
    parsed_data = parser.get_parsed_data

    # Print example entry
    if parsed_data:
        example = parsed_data[0]
        print("\nExample parsed entry:")
        print(f"Question: {example.raw_question}")
        print(f"Answer: {example.answer}")
```

### 5. Create Tests

Create a test file `tests/test_your_dataset_parser.py`:

```python
import pytest

from llmdataparser.your_dataset_parser import YourDatasetParser, YourDatasetParseEntry


def test_parser_initialization():
    parser = YourDatasetParser()
    assert parser._data_source == "huggingface/your-dataset"
    assert parser._default_task == "default"
    assert "task1" in parser._task_names


def test_process_entry():
    parser = YourDatasetParser()
    sample_row = {
        "question": "Sample question",
        "answer": "Sample answer",
        "custom_field": "Custom value",
    }

    entry = parser.process_entry(sample_row)
    assert isinstance(entry, YourDatasetParseEntry)
    assert entry.raw_question == "Sample question"
    assert entry.custom_field == "Custom value"
```

## Best Practices

1. **Type Safety**: Use type hints consistently and ensure your parser is properly typed.
2. **Documentation**: Add clear docstrings and comments explaining your parser's functionality.
3. **Error Handling**: Include appropriate error checking and validation (see the sketch after this list).
4. **Testing**: Write comprehensive tests covering different scenarios.
5. **System Prompt**: Design your system prompt carefully to guide the model effectively.

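As a concrete illustration of the error-handling point above, here is one possible validation sketch for `process_entry`. It reuses the field names from the earlier examples (`question`, `answer`, `custom_field`); the specific exceptions (`KeyError`, `ValueError`) are one reasonable convention, not a library requirement.

```python
def process_entry(
    self,
    row: dict[str, Any],
    task_name: str | None = None,
    **kwargs: Any,
) -> YourDatasetParseEntry:
    """Process a single dataset entry, validating required fields first."""
    # Fail fast with a clear message if the row is missing expected fields
    required_fields = ("question", "answer", "custom_field")
    missing = [field for field in required_fields if field not in row]
    if missing:
        raise KeyError(f"Row is missing required fields: {missing}")

    raw_question = row["question"]
    raw_answer = row["answer"]
    if not raw_question or not raw_answer:
        raise ValueError("Both 'question' and 'answer' must be non-empty")

    task = task_name or self._get_current_task(row)
    prompt = f"{self._system_prompt}\nQuestion: {raw_question}\nAnswer:"

    return YourDatasetParseEntry.create(
        prompt=prompt,
        answer=raw_answer,
        raw_question=raw_question,
        raw_answer=raw_answer,
        task_name=task,
        custom_field=row["custom_field"],
    )
```
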
## Examples

Look at existing parsers for reference:

- `mmlu_parser.py` for multiple-choice questions
- `gsm8k_parser.py` for math word problems
- `humaneval_parser.py` for code generation tasks

## Common Patterns

1. **Parse Entry Class**: Create a custom parse entry class if you need additional fields.
2. **Task Names**: Define all available tasks in `_task_names`.
3. **System Prompt**: Write clear instructions in the system prompt.
4. **Process Entry**: Handle data extraction and formatting in `process_entry`.
5. **Dataset Description**: Provide comprehensive dataset information.
6. **Evaluation Metrics**: Define appropriate metrics for your dataset.

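Parsers are exposed through `ParserRegistry` by registering them in `llmdataparser/__init__.py`. Based on the existing registrations, wiring in a new parser might look like the following sketch; the registry key `"yourdataset"` is an arbitrary example, not a fixed convention.

```python
# In llmdataparser/__init__.py
from .your_dataset_parser import YourDatasetParser

ParserRegistry.register_parser("yourdataset", YourDatasetParser)
```
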
## Testing Your Parser

1. Run the example usage code to verify basic functionality
2. Run pytest to execute your test cases
3. Try different dataset splits and tasks
4. Verify the parsed output format
5. Check error handling with invalid inputs (see the test sketch after this list)

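For the last point, here is a test sketch for invalid inputs that could sit alongside the tests from step 5. It assumes `process_entry` raises `KeyError` or `ValueError` for incomplete rows, as in the validation sketch under Best Practices.

```python
import pytest

from llmdataparser.your_dataset_parser import YourDatasetParser


def test_process_entry_missing_fields():
    parser = YourDatasetParser()
    incomplete_row = {"question": "Sample question"}  # no "answer" or "custom_field"

    # Expect a clear error rather than a silently malformed entry
    with pytest.raises((KeyError, ValueError)):
        parser.process_entry(incomplete_row)
```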
llmdataparser/__init__.py CHANGED
@@ -10,7 +10,7 @@ from .math_parser import MATHDatasetParser
 from .mbpp_parser import MBPPDatasetParser
 from .mgsm_parser import MGSMDatasetParser
 from .mmlu_parser import (
-    MMLUDatasetParser,
+    BaseMMLUDatasetParser,
     MMLUProDatasetParser,
     MMLUReduxDatasetParser,
     TMMLUPlusDatasetParser,
@@ -44,7 +44,7 @@ class ParserRegistry:


 # Register parsers
-ParserRegistry.register_parser("mmlu", MMLUDatasetParser)
+ParserRegistry.register_parser("mmlu", BaseMMLUDatasetParser)
 ParserRegistry.register_parser("mmlupro", MMLUProDatasetParser)
 ParserRegistry.register_parser("mmluredux", MMLUReduxDatasetParser)
 ParserRegistry.register_parser("tmmluplus", TMMLUPlusDatasetParser)
llmdataparser/base_parser.py CHANGED
@@ -119,20 +119,13 @@ class DatasetParser(Generic[T], ABC):
         T: The processed entry, typically an instance of a subclass of ParseEntry.
         """

+    @abstractmethod
     def get_dataset_description(self) -> DatasetDescription:
         """Returns a standardized description of the dataset."""
-        return DatasetDescription(
-            name="Unknown",
-            purpose="Not specified",
-            source="Not specified",
-            language="Not specified",
-            format="Not specified",
-            characteristics="Not specified",
-        )

+    @abstractmethod
     def get_evaluation_metrics(self) -> list[EvaluationMetric]:
         """Returns the recommended evaluation metrics for the dataset."""
-        return []


 @dataclass(frozen=True, kw_only=True, slots=True)