victormiller commited on
Commit
583d7c5
1 Parent(s): 24b53c0

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +320 -9
curated.py CHANGED
@@ -11,26 +11,337 @@ import plotly.express as px
11
 
12
  filtering_process = Div(
13
  Section(
14
- H3("Title"),
15
  H4("Download and Extraction"),
16
  Ol(
17
- Li("one"),
18
- Li("two"),
19
  ),
20
  H4("Filtering"),
21
  Ol(
22
- Li("one"),
23
- Li("two"),
24
  ),
25
  H4("Local Deduplication Process"),
26
  Ol(
27
- Li("one"),
28
- Li("two"),
29
  ),
30
  H4("Global Deduplication Process"),
31
  Ol(
32
- Li("one"),
33
- Li("two"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ),
35
 
36
  ),
 
11
 
12
  filtering_process = Div(
13
  Section(
14
+ H3("Wikipedia"),
15
  H4("Download and Extraction"),
16
  Ol(
17
+ Li("Downloaded from Wikimedia official dump of wikipedia on huggingface https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"),
18
+ Li("Data is originally in parquet format so we use huggingface dataset.to_json function to convert it to the jsonl format"),
19
  ),
20
  H4("Filtering"),
21
  Ol(
22
+ Li("As we expect the dataset to be already of high quality so only one filter is applied which is to remove all documents (articles) with less than 10 words (not inclusive)"),
 
23
  ),
24
  H4("Local Deduplication Process"),
25
  Ol(
26
+ Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
 
27
  ),
28
  H4("Global Deduplication Process"),
29
  Ol(
30
+ Li("After local dedup, remaining wikipedia was deduped again with all the datasets combined"),
31
+ ),
32
+
33
+ ),
34
+ Section(
35
+ H3("ArXiv"),
36
+ H4("Download and Extraction"),
37
+ Ol(
38
+ Li("All the data was downloaded in original latex format from Arxiv official S3 dump s3://arxiv/src"),
39
+ Li("We try to encode the downloaded data into utf-8 or guess encoding using chardet library"),
40
+ Li("After that pandoc was used to extract information from the latex files and saved as markdown format - code: pandoc -s {tex} -o out/{out_name}.md --wrap=none"),
41
+ Li("All markdowns were combined to create jsonl files"),
42
+ ),
43
+ H4("Filtering"),
44
+ P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
45
+ Ol(
46
+ Li("min_word: less than 500 words (not inclusive) are discarded"),
47
+ Li("Language: any language other than English are discarded"),
48
+ Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
49
+ Li("Unigram log probability: Must have higher than -20 average unigram log probability. To calculate the average log word probability, we use word frequencies extracted from the 1T Web Ngram corpus; specifically, we use the list available created by Rachel Tatman. A copy is hosted here."),
50
+ Li("number 4 above had hyperlinks that need to be included"),
51
+ ),
52
+ H4("Local Deduplication Process"),
53
+ Ol(
54
+ Li("Local dedup was done with all papers combined."),
55
+ ),
56
+ H4("Global Deduplication Process"),
57
+ Ol(
58
+ Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
59
+ ),
60
+
61
+ ),
62
+ Section(
63
+ H3("S2ORC"),
64
+ H4("Download and Extraction"),
65
+ Ol(
66
+ Li("This was downloaded directly in zip format using S2ORC api key and normal get request. code: response = urllib.request.urlopen(url)"),
67
+ Li("There were two kinds of datasets that were downloaded: S2ORC and S2ORC abstract"),
68
+ ),
69
+ H4("Filtering - S2ORC"),
70
+ P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
71
+ Ol(
72
+ Li("title_abstract: must have title and abstract"),
73
+ Li("The paper must be in English. To determine the language of each document, we use the pycld3 library. We run pycld3 on the first 2000 characters of each paragraph in the paper. The language of the paper is the most common language of the paragraphs."),
74
+ Li("word_count: less than 500 words (not inclusive) are discarded"),
75
+ Li("paragraph_count: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability"),
76
+ Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
77
+ ),
78
+ H4("Filtering - S2ORC Abstract"),
79
+ P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
80
+ Ol(
81
+ Li("title_abstract: must have title and abstract"),
82
+ Li("language: abstract must be in English"),
83
+ Li("word_count: less than 20 (not inclusive) are discarded"),
84
+ Li("Unigram log probability"),
85
+ Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
86
+ ),
87
+ H4("Local Deduplication Process"),
88
+ Ol(
89
+ Li("Local dedup was done with all papers combined."),
90
+ ),
91
+ H4("Global Deduplication Process"),
92
+ Ol(
93
+ Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
94
+ ),
95
+
96
+ ),
97
+ Section(
98
+ H3("PubMed"),
99
+ H4("Download and Extraction"),
100
+ Ol(
101
+ Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
102
+ Li("All the urls are downloaded and the downloaded data is in xml.tar format"),
103
+ Li("For pubmed central First tar files are opened using tarfile library and then converted to markdown format using pandoc: pandoc -f jats {nxml} -o {pmcid}.md --wrap=none"),
104
+ Li("All the markdown files are combined to create jsonl files. In jsonl files, 1 line correspond to 1 markdown file."),
105
+ Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
106
+ ),
107
+ H4("Filtering"),
108
+ P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
109
+ Ol(
110
+ Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
111
+ Li("Language: any language other than English are discarded"),
112
+ Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace. This filter is not used for pubmed abstract"),
113
+ Li("Unigram log probability: Must have higher than -20 average unigram log probability. To calculate the average log word probability, we use word frequencies extracted from the 1T Web Ngram corpus; specifically, we use the list available created by Rachel Tatman. A copy is hosted here."),
114
+ Li("need to add the hyperlinks for the section above"),
115
+ ),
116
+ H4("Local Deduplication Process"),
117
+ Ol(
118
+ Li("Local dedup was done with all papers combined."),
119
+ ),
120
+ H4("Global Deduplication Process"),
121
+ Ol(
122
+ Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
123
+ ),
124
+
125
+ ),
126
+ Section(
127
+ H3("Phil Papers"),
128
+ H4("Download and Extraction"),
129
+ Ol(
130
+ Li("Original pdf files download location was downloaded from https://philarchive.org/oai.pl "),
131
+ Li("All pdf files were downloaded"),
132
+ Li("Pdf was converted to text using java -jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}"),
133
+ Li("Language was detected and added using langdetect library"),
134
+ ),
135
+ H4("Filtering"),
136
+ Ol(
137
+ Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
138
+ ),
139
+ H4("Local Deduplication Process"),
140
+ Ol(
141
+ Li("Local dedup was done with all papers combined."),
142
+ ),
143
+ H4("Global Deduplication Process"),
144
+ Ol(
145
+ Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
146
+ ),
147
+
148
+ ),
149
+ Section(
150
+ H3("Europarl"),
151
+ H4("Download and Extraction"),
152
+ Ol(
153
+ Li("Original data was downloaded from http://www.statmt.org/europarl/v7/europarl.tgz"),
154
+ Li("Finally the remaining files are converted to jsonl lines"),
155
+ ),
156
+ H4("Filtering"),
157
+ Ol(
158
+ Li("Documents smaller than 200 characters were removed while downloading, so no other filters were run"),
159
+ Li("Tags were also removed while downloading"),
160
+ ),
161
+ H4("Local Deduplication Process"),
162
+ Ol(
163
+ Li("Local dedup was done within europarl itself"),
164
+ ),
165
+ H4("Global Deduplication Process"),
166
+ Ol(
167
+ Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
168
+ ),
169
+
170
+ ),
171
+ Section(
172
+ H3("HackerNews"),
173
+ H4("Download and Extraction"),
174
+ Ol(
175
+ Li("Data was parsed using hackernews story ids starting using https://hacker-news.firebaseio.com/v0/item/"),
176
+ Li("Story ids was started from 1 till 37500000 (all stories that gives error while pinging the url was removed). Each post is a story, with each reply another story"),
177
+ Li("As there were too many requests error, there was a wait(2 sec) statement included in the code"),
178
+ Li("As the number of stories were large and containing all the replies was time consuming and possibility of introducing too much error, only longest depth threads were included from 3rd level onwards. So we include the title then all the replies (2nd level) but replies to those replies (3rd level) were only the ones which has maximum depth."),
179
+ ),
180
+ H4("Filtering"),
181
+ Ol(
182
+ Li("Min word: 10"),
183
+ Li("Language: Only english"),
184
+ Li("Unigram log probability"),
185
+ ),
186
+ H4("Local Deduplication Process"),
187
+ Ol(
188
+ Li("Local dedup was done within hackernews itself"),
189
+ ),
190
+ H4("Global Deduplication Process"),
191
+ Ol(
192
+ Li("After local dedup, remaining data was deduped again with all the datasets combined"),
193
+ ),
194
+
195
+ ),
196
+ Section(
197
+ H3("USPTO"),
198
+ H4("Download and Extraction"),
199
+ Ol(
200
+ Li("Data was downloaded and extracted using tags from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),
201
+ Li("There were three different format that needed three different functions to download and extract the data based on year: Pre_2002, 2002_to_2004, post_2004"),
202
+
203
+ ),
204
+ H4("Filtering"),
205
+ Ol(
206
+ Li("Min word: 50"),
207
+ Li("Language: Only english"),
208
+ Li("Unigram log probability"),
209
+ ),
210
+ H4("Local Deduplication Process"),
211
+ Ol(
212
+ Li("Local dedup was done within USPTO itself"),
213
+ ),
214
+ H4("Global Deduplication Process"),
215
+ Ol(
216
+ Li("After local dedup, remaining data was deduped again with all the datasets combined"),
217
+ ),
218
+
219
+ ),
220
+ Section(
221
+ H3("FreeLaw"),
222
+ H4("Download and Extraction"),
223
+ Ol(
224
+ Li("CSV format bulk data was downloaded from https://storage.courtlistener.com/bulk-data/"),
225
+ Li("They have multiple dumps as shown below with lot of duplicates (exact number is given in the table at the top)"),
226
+ Li("there is an image to show here!"),
227
+ Li("As these are csv files, they have multiple columns where text can be present, so we extracted text from the following columns using html2text function which just convert and extract tags from html tags"),
228
+ Li("image to show"),
229
+ Li("Text was also extracted from row named 'plain_text'"),
230
+ Li("Priority is always given to plain_text first then from 6 to 1 in the subsequent order following pile logic"),
231
+ ),
232
+ H4("Filtering"),
233
+ Ol(
234
+ Li("Min word: 50"),
235
+ Li("Language: Only english"),
236
+ Li("Unigram log probability"),
237
+ ),
238
+ H4("Local Deduplication Process"),
239
+ Ol(
240
+ Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
241
+ ),
242
+ H4("Global Deduplication Process"),
243
+ Ol(
244
+ Li("After local dedup, remaining data was deduped again with all the datasets combined"),
245
+ ),
246
+
247
+ ),
248
+ Section(
249
+ H3("StackExchange"),
250
+ H4("Download and Extraction"),
251
+ Ol(
252
+ Li("Archive dump was used to download data from all the stackexchange sub urls, eg., math.stackexchange etc."),
253
+ Li("Raw data is in XML format with lot of metadata. We only used two files Posts.xml and Comments.xml"),
254
+ Li("We parsed using post_id to connect each question to answer and then to comments so our data has same hierarchy as stackexchange UI"),
255
+ Li("""
256
+ 1. Questions:
257
+ 2. Comment1:
258
+ 3. Comment2:
259
+ 4. Answer1:
260
+ 5. Comment1:
261
+ 6. Comment2:
262
+ 7. Answer2:
263
+ 8. Comment1:
264
+ 9. Comment2:
265
+ """),
266
+ ),
267
+ H4("Filtering"),
268
+ Ol(
269
+ Li("Min word: 10"),
270
+ ),
271
+ H4("Local Deduplication Process"),
272
+ Ol(
273
+ Li("Local dedup was done within stackexchange itself"),
274
+ ),
275
+ H4("Global Deduplication Process"),
276
+ Ol(
277
+ Li("After local dedup, remaining data was deduped again with all the datasets combined"),
278
+ ),
279
+
280
+ ),
281
+ Section(
282
+ H3("Ubuntu IRC"),
283
+ H4("Download and Extraction"),
284
+ Ol(
285
+ Li("All the data was downloaded from https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/ based on the year"),
286
+ Li("During extraction, we cleaned the logs using following functions"),
287
+ Li("image here"),
288
+ ),
289
+ H4("Filtering"),
290
+ Ol(
291
+ Li("Min word: 10"),
292
+ Li("Language: Only english"),
293
+ Li("Unigram log probability"),
294
+ ),
295
+ H4("Local Deduplication Process"),
296
+ Ol(
297
+ Li("Local dedup was done within Ubuntu IRC itself"),
298
+ ),
299
+ H4("Global Deduplication Process"),
300
+ Ol(
301
+ Li("After local dedup, remaining data was deduped again with all the datasets combined"),
302
+ ),
303
+
304
+ ),
305
+ Section(
306
+ H3("DM Maths"),
307
+ H4("Download and Extraction"),
308
+ Ol(
309
+ Li("Directly downloaded from hugging-face dump dm_maths/"),
310
+ Li("Data was converted in jsonl format where each lines are : Question: TEXT Answer: TEXT"),
311
+ ),
312
+ H4("Filtering"),
313
+ Ol(
314
+ Li("None"),
315
+ ),
316
+ H4("Local Deduplication Process"),
317
+ Ol(
318
+ Li("None"),
319
+ ),
320
+ H4("Global Deduplication Process"),
321
+ Ol(
322
+ Li("None"),
323
+ ),
324
+
325
+ ),
326
+ Section(
327
+ H3("PG19"),
328
+ H4("Download and Extraction"),
329
+ Ol(
330
+ Li("Directly downloaded from hugging-face dump pg19/"),
331
+ ),
332
+ H4("Filtering"),
333
+ Ol(
334
+ Li("Min word: 20"),
335
+ Li("Language: ???"),
336
+ Li("Unigram log probability"),
337
+ ),
338
+ H4("Local Deduplication Process"),
339
+ Ol(
340
+ Li("Local dedup was done within PG19 itself"),
341
+ ),
342
+ H4("Global Deduplication Process"),
343
+ Ol(
344
+ Li("After local dedup, remaining data was deduped again with all the datasets combined"),
345
  ),
346
 
347
  ),