victormiller committed
Commit
9c1b63e
1 Parent(s): ab1848a

Update curated.py

Files changed (1)
  1. curated.py +45 -7
curated.py CHANGED
@@ -551,6 +551,9 @@ filtering_process = Div(
551
  H3("ArXiv"),
552
  P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
553
  P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxic/src", href="s3://arxic/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
 
 
 
554
  P(B(" Filters Applied: "), "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
555
  Ul(
556
  Li("Language Filter: any language other than English are discarded", style = "margin-bottom: -3px"),
@@ -639,6 +642,10 @@ filtering_process = Div(
639
  Div(
640
  H3("PubMed Central and PubMed Abstract"),
641
  P(B("Download and Extraction: "), "All files were downloaded from", A("ttps://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",href="ttps://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc", D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format."),
 
 
 
 
642
  P(B("Filters Applied: "), "Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
643
  Ul(
644
  Li("Minimum Word Count Filter: PMC documents with less than 100 words (not inclusive) are discarded; PMA documents less than 20 words are discarded", style = "margin-bottom: -3px"),
@@ -699,7 +706,7 @@ filtering_process = Div(
699
  ),
700
  Section(
701
  Div(
702
- H3("Europarl"),
703
  P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks."),
704
  P(B("Download and Extraction: "), "Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
705
  P(B("Filters Applied: ") ,"EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained HTML tags which were removed."),
@@ -750,6 +757,11 @@ filtering_process = Div(
750
  P("High-quality dialog-based dataset where user comments on the links as the head post aggregated by Y Combinator."),
751
  P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
752
  P("The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
 
 
 
 
 
753
  P(B("Filters Applied: ")),
754
  Ul(
755
  Li("Language Filter: English", style = "margin-bottom: -3px"),
@@ -779,15 +791,20 @@ filtering_process = Div(
779
  P("Legal documents and court cases from various jurisdictions provided by US-registered non-profit firm Free Law Project. We have included data from CourtListener which included millions of legal opinions from federal and state courts."),
780
  P(B("Download and Extraction"), "The dataset was downloaded from:", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
781
  D_code("""
782
- ("html", html2text),
783
- ("html_lawbox", html2text),
784
- ("html_columbia", html2text),
785
- ("html_anon_2020", html2text),
786
- ("html_with_citations", html2text),
787
- ("xml_harvard", html2text),
788
  plain_text
789
  """, language ="SQL"),
790
  P("All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."),
 
 
 
 
 
 
 
 
791
  P(B("Filters Applied: ")),
792
  Ul(
793
  Li("Language Filter: English", style = "margin-bottom: -3px"),
@@ -828,6 +845,12 @@ filtering_process = Div(
828
  8. Comment1:
829
  9. Comment2:
830
  """),
 
 
 
 
 
 
831
  P(B("Filters Applied: ")),
832
  Ul(
833
  Li("Minimum Word Count Filter: 10", style = "margin-bottom: -3px"),
@@ -866,6 +889,10 @@ filtering_process = Div(
866
  def clean(x):
867
  return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
868
  """, block="block", language="python" ),
 
 
 
 
869
  P(B("Filters Applied: ")),
870
  Ul(
871
  Li("Language Filter: English", style = "margin-bottom: -3px"),
@@ -883,6 +910,11 @@ filtering_process = Div(
883
  D_code("""
884
  Question: TEXT
885
  Answer: TEXT""", block="block", language="python"),
 
 
 
 
 
886
  P(B("Filters Applied: ")),
887
  Ul(
888
  Li("No filtering was applied to DM Math", style = "margin-bottom: -3px"),
@@ -908,6 +940,12 @@ filtering_process = Div(
908
  H3("PG-19"),
909
  P("A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919."),
910
  P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
 
 
 
 
 
 
911
  P(B("Filters Applied:")),
912
  Ul(
913
  Li("Language Filter: English", style = "margin-bottom: -3px"),
 