victormiller committed on
Commit
1630e9d
·
verified ·
1 Parent(s): d4c2068

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +23 -23
curated.py CHANGED
@@ -21,8 +21,8 @@ overview = Div(
21
  ),
22
  ),
23
 
24
- overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
25
- copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
26
 
27
  treemap_data = {
28
  'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
@@ -449,6 +449,25 @@ eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
449
  ## end filtered examples
450
 
451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  filtering_process = Div(
453
  Section(
454
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
@@ -488,7 +507,7 @@ filtering_process = Div(
488
  Ol(
489
  Li("Language Filter: any language other than English are discarded"),
490
  Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
491
- Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
492
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
493
  ),
494
  table_div_arx,
@@ -1095,26 +1114,7 @@ def curated(request):
1095
 
1096
 
1097
 
1098
- data_preprocessing_div = Div(
1099
- H2("Data Preprocessing"),
1100
- P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
1101
- H3("Language Filter"),
1102
- P("The Language Filter removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
1103
- H3("Minimum Word Count Filter"),
1104
- P("The Minimum Word Count Filter sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource."),
1105
- H3("Unigram Log Probability"),
1106
- P("The Unigram Log Probability Filter calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words."),
1107
- H2("Data Processing for S2ORC"),
1108
- P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
1109
- H3("Title Abstract Filter"),
1110
- P("The Title Abstract Filter extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
1111
- H3("Majority Language Filter"),
1112
- P("The Majority Language Filter identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
1113
- H3("Paragraph Count Filter"),
1114
- P("The Paragraph Count Filter counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful hueristic for document complexity."),
1115
- H3("Frequency Filter"),
1116
- P("The Frequency Filter calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
1117
- )
1118
 
1119
  return Div(
1120
  overview,
 
21
  ),
22
  ),
23
 
24
+ overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity.", B(" We were strongly influenced by The Pile regarding both inclusion of the dataset and filtering techniques."), " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
25
+ copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in The Pile like YouTube and Opensubtitles, Reddit threads, and books.")
26
 
27
  treemap_data = {
28
  'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
 
449
  ## end filtered examples
450
 
451
 
452
+ data_preprocessing_div = Div(
453
+ H2("Data Preprocessing"),
454
+ P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
455
+
456
+ P("The ", B("Language Filter"), " removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
457
+ H3("Minimum Word Count Filter"),
458
+ P("The ", B("Minimum Word Count Filter")," sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource."),
459
+
460
+ P("The ", B("Unigram Log Probability Filter")," calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words."),
461
+ H3("Data Processing for S2ORC"),
462
+ P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
463
+ P("The ", B("Title Abstract Filter")," extracts information from the title and abstract. This step provides additional information for analysis but may introduce bias in the analysis."),
464
+ P("The ", B("Majority Language Filter")," identifies the majority language in the dataset. This step displays the distribution of languages in the dataset to enable language-specific analysis and insights."),
465
+ P("The ", B("Paragraph Count Filter")," counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful hueristic for document complexity."),
466
+ P("The ",B("Frequency Filter")," calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
467
+ )
468
+
469
+
470
+
471
  filtering_process = Div(
472
  Section(
473
  H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
 
507
  Ol(
508
  Li("Language Filter: any language other than English are discarded"),
509
  Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
510
+ Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by ", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
511
  Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
512
  ),
513
  table_div_arx,
 
1114
 
1115
 
1116
 
1117
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
 
1119
  return Div(
1120
  overview,