victormiller commited on
Commit
e3ed423
1 Parent(s): 913dc7b

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +27 -14
curated.py CHANGED
@@ -595,15 +595,6 @@ filtering_process = Div(
595
  Li("paragraph_count: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability"),
596
  Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
597
  ),
598
- H4("Filtering - S2ORC Abstract"),
599
- P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
600
- Ol(
601
- Li("title_abstract: must have title and abstract"),
602
- Li("language: abstract must be in English"),
603
- Li("word_count: less than 20 (not inclusive) are discarded"),
604
- Li("Unigram log probablity"),
605
- Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
606
- ),
607
  H4("Local Deduplication Process"),
608
  Ol(
609
  Li("Local dedup was done with all papers combined."),
@@ -616,7 +607,7 @@ filtering_process = Div(
616
  Details(
617
  Summary("FreeLaw Filtering Examples -- need to update"),
618
  Div(
619
- freelaw_examples,
620
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
621
  ),
622
  style="""
@@ -628,6 +619,28 @@ filtering_process = Div(
628
  ),
629
  ),
630
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
  Section(
632
  Div(
633
  H3("PubMed - need to update with abstract vs central"),
@@ -797,7 +810,7 @@ filtering_process = Div(
797
  H3("FreeLaw"),
798
  P("Legal documents and court cases from various jurisdictions provided by US-registered non-profit firm Free Law Project. We have included data from CourtListener which included millions of legal opinions from federal and state courts."),
799
  H4("Download and Extraction"),
800
- #P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), )#". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
801
  D_code("""
802
  ("html", html2text),
803
  ("html_lawbox", html2text),
@@ -839,7 +852,7 @@ filtering_process = Div(
839
  Div(
840
  H3("StackExchange"),
841
  P("A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories for question-answer pairs. We have included comments also to include an overall discussion on each post."),
842
- P(B("Download and Extraction: "), "The archive dataset was used to download all data from StackExchange and StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments."),
843
  P("""
844
  1. Questions:
845
  2. Comment1:
@@ -937,9 +950,9 @@ filtering_process = Div(
937
  P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
938
  H4("Filtering"),
939
  Ol(
940
- Li("Language Filter: ???", style = "margin-bottom: 2px"),
941
  Li("Minimum Word Count Filter: 20", style = "margin-bottom: 2px"),
942
- Li("Unigram Log Probability", style = "margin-bottom: 2px"),
943
  ),
944
  table_div_pg19,
945
  Details(
 
595
  Li("paragraph_count: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability"),
596
  Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
597
  ),
 
 
 
 
 
 
 
 
 
598
  H4("Local Deduplication Process"),
599
  Ol(
600
  Li("Local dedup was done with all papers combined."),
 
607
  Details(
608
  Summary("FreeLaw Filtering Examples -- need to update"),
609
  Div(
610
+ P("examples are missing"),
611
  style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
612
  ),
613
  style="""
 
619
  ),
620
  ),
621
  ),
622
+ Section(
623
+ Div(
624
+ H3("S2ORC ABSTRACT"),
625
+ P("The Semantic Scholar Open Research Corpus (S2ORC) is a comprehensive dataset designed for natural language processing (NLP) and text-mining research over scientific papers. It includes rich metadata, and abstract and full-text content for millions of academic papers across various disciplines. This dataset is further divided into two components, S2ORC abstract and S2ORC full text."),
626
+ H4("Download and Extraction"),
627
+ Ol(
628
+ Li("This was downloaded directly in zip format using S2ORC api key and normal get request. code: response = urllib.request.urlopen(url)"),
629
+ Li("There were two kinds of datasets that were downloaded: S2ORC and S2ORC abstract"),
630
+ ),
631
+ H4("Filtering - S2ORC Abstract"),
632
+ P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
633
+ Ol(
634
+ Li("title_abstract: must have title and abstract"),
635
+ Li("language: abstract must be in English"),
636
+ Li("word_count: less than 20 (not inclusive) are discarded"),
637
+ Li("Unigram log probability"),
638
+ Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
639
+ ),
640
+ )
641
+ ),
642
+
643
+
644
  Section(
645
  Div(
646
  H3("PubMed - need to update with abstract vs central"),
 
810
  H3("FreeLaw"),
811
  P("Legal documents and court cases from various jurisdictions provided by US-registered non-profit firm Free Law Project. We have included data from CourtListener which included millions of legal opinions from federal and state courts."),
812
  H4("Download and Extraction"),
813
+ P("The dataset was downloaded from:", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
814
  D_code("""
815
  ("html", html2text),
816
  ("html_lawbox", html2text),
 
852
  Div(
853
  H3("StackExchange"),
854
  P("A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more. This is one of the largest publicly available repositories for question-answer pairs. We have included comments also to include an overall discussion on each post."),
855
+ P(B("Download and Extraction: "), "The archive dataset was used to download all data from StackExchange and 364 StackExchange's sub URLs including: ", A("math.stackexchange.com", href="math.stackexchange.com"),". Raw data was extracted in an XML format and only two files Posts.xml and Comments.xml were considered. To match the StackExchange hierarchy, each file was parsed using post_id to connect questions to answers and then to comments. We will include the full list of sub URLs when the code is released."),
856
  P("""
857
  1. Questions:
858
  2. Comment1:
 
950
  P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface:", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
951
  H4("Filtering"),
952
  Ol(
953
+ Li("Language Filter: English", style = "margin-bottom: 2px"),
954
  Li("Minimum Word Count Filter: 20", style = "margin-bottom: 2px"),
955
+ Li("Unigram Log Probability: ", "-20", style = "margin-bottom: 2px"),
956
  ),
957
  table_div_pg19,
958
  Details(