victormiller committed
Commit 3d4aecc
1 Parent(s): 1630e9d

Update curated.py

Files changed (1): curated.py +61 -63
curated.py CHANGED
@@ -12,7 +12,7 @@ import plotly.express as px
 from fasthtml.components import D_code
 
 overview = Div(
-    H2("Curated Source Processing Overview"),
+    H2("Curated Sources Processing"),
     H3("What This Section Contains"),
     P("This section provides a complete discussion on the filtering applied to the 14 curated sources that comprise the non-web data section of TxT360. The section is split into the following topic areas: "),
     Ul(
@@ -21,8 +21,12 @@ overview = Div(
     ),
 ),
 
-overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity.", B(" We were strongly influenced by The Pile regarding both inclusion of the dataset and filtering techniques."), " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
-copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in The Pile like YouTube and Opensubtitles, Reddit threads, and books.")
+curated_sources_intro = Div(
+    H2("Curated Sources in TxT360"),
+    P("Curated sources comprise high-quality datasets that contain domain-specificity.", B(" TxT360 was strongly influenced by The Pile regarding both inclusion of the dataset and filtering techniques."), " These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. "),
+    P("TxT360 respects the copyright of the data sources and have not included the controversial data that was used in The Pile like YouTube and Opensubtitles, Reddit threads, and books."),
+)
+
 
 treemap_data = {
     'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
@@ -449,14 +453,12 @@ eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
 ## end filtered examples
 
 
+
 data_preprocessing_div = Div(
-    H2("Data Preprocessing"),
+    H2("Filtering Steps and Definitions"),
     P("Data preprocessing is a crucial step in the data science pipeline. It involves cleaning and transforming raw data into a format that is suitable for analysis. This process includes handling missing values, normalizing data, encoding categorical variables, and more."),
-
     P("The ", B("Language Filter"), " removes documents in unwanted languages. This step improves data quality by removing irrelevant documents."),
-    H3("Minimum Word Count Filter"),
     P("The ", B("Minimum Word Count Filter")," sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each datasource."),
-
     P("The ", B("Unigram Log Probability Filter")," calculates the log probability of each unigram to measure the significance of individual words. This step quantifies the importance of individual words but maay not capture the semantic meaning of words."),
     H3("Data Processing for S2ORC"),
     P("The formating of the S2ORC dataset required special filters to be applied. These filters were not applied to the other data sources."),
@@ -466,11 +468,61 @@ data_preprocessing_div = Div(
     P("The ",B("Frequency Filter")," calculates the frequency of each word in the dataset. This step serves to identify important words and topics in the dataset but may be sensitive to noise and outliers."),
 )
 
+# Data for the stacked bar chart
+data = {
+    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
+    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
+    'Freelaw': [75971288, 73690766, 68171834, 68123174],
+    'DM Maths': [112559888, 112559888, 112559888, 112559888],
+    'USPTO': [6880276, 6878964, 6749922, 6749389],
+    'PG19': [28752, 28683, 28682, 28632],
+    'Hackernews': [2064931, 2010802, 2010488, 2003636],
+    'Ubuntu IRC': [37966, 23501, 23468, 23205],
+    'Europarl': [69814, 69814, 69814, 69814],
+    'StackExchange': [23246548, 23246548, 23246352, 23246352],
+    'Arxiv': [1911867, 1869441, 1763840, 1762661],
+    'S2ORC': [12963563, 12963563, 12963563, 12963563],
+    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
+    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
+    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
+    'Phil Papers': [49389, 39175, 39175, 39128]
+}
+
+# Creating a dataframe
+df = pd.DataFrame(data)
+
+# Creating the stacked bar chart
+fig = go.Figure()
+
+# Add trace for each dataset
+for dataset in df.columns[1:]:
+    fig.add_trace(go.Bar(
+        name=dataset,
+        x=df['Filter'],
+        y=df[dataset]
+    ))
+
+# Update the layout
+fig.update_layout(
+    barmode='stack',
+    title='Document Reduction by Filter for Each Dataset',
+    xaxis_title='Filter',
+    yaxis_title='Number of Lines',
+    legend_title='Dataset',
+    height=600,
+    width=1000
+)
+
+# Show the plot
+diff2_stacked_bar = fig
 
 
 filtering_process = Div(
     Section(
-        H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
+        H2("Discussion on Filtering All Curated Sources")
+        P("Below is a detail recount of how each dataset was extracted and filtered. If specific challenges were found with a dataset, they are included and discussed to the best of our abilities. The figure below provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
+        plotly2fasthtml(diff2_stacked_bar),
+        H3("This section continues belows with the specific filtering steps taken for all 14 curated datasets."),
     ),
     Section(
         Div(
@@ -1006,55 +1058,6 @@ def update(target: str, request):
     return get_data(
         params.get(f"data_source_{target}"), doc_id, target)
 
-# Data for the stacked bar chart
-data = {
-    'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
-    'Wikipedia': [61614907, 61614907, 60468491, 60468491],
-    'Freelaw': [75971288, 73690766, 68171834, 68123174],
-    'DM Maths': [112559888, 112559888, 112559888, 112559888],
-    'USPTO': [6880276, 6878964, 6749922, 6749389],
-    'PG19': [28752, 28683, 28682, 28632],
-    'Hackernews': [2064931, 2010802, 2010488, 2003636],
-    'Ubuntu IRC': [37966, 23501, 23468, 23205],
-    'Europarl': [69814, 69814, 69814, 69814],
-    'StackExchange': [23246548, 23246548, 23246352, 23246352],
-    'Arxiv': [1911867, 1869441, 1763840, 1762661],
-    'S2ORC': [12963563, 12963563, 12963563, 12963563],
-    'S2ORC Abstract': [102324176, 83867601, 82889293, 82777912],
-    'Pubmed Central': [5230932, 4830486, 4768310, 4767474],
-    'Pubmed Abstract': [25787474, 25784374, 25747955, 25746724],
-    'Phil Papers': [49389, 39175, 39175, 39128]
-}
-
-# Creating a dataframe
-df = pd.DataFrame(data)
-
-# Creating the stacked bar chart
-fig = go.Figure()
-
-# Add trace for each dataset
-for dataset in df.columns[1:]:
-    fig.add_trace(go.Bar(
-        name=dataset,
-        x=df['Filter'],
-        y=df[dataset]
-    ))
-
-# Update the layout
-fig.update_layout(
-    barmode='stack',
-    title='Document Reduction by Filter for Each Dataset',
-    xaxis_title='Filter',
-    yaxis_title='Number of Lines',
-    legend_title='Dataset',
-    height=600,
-    width=1000
-)
-
-# Show the plot
-diff2_stacked_bar = fig
-
-
 
 def curated(request):
 
@@ -1118,14 +1121,9 @@ def curated(request):
 
     return Div(
        overview,
-        H2("Curated Sources: Overview"),
-        overview_text,
-        copyright_disclaimer,
+        curated_sources_intro,
         plotly2fasthtml(treemap_chart),
         data_preprocessing_div,
-        H2("Curated Sources Processing"),
-        plotly2fasthtml(diff2_stacked_bar),
-        P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
         filtering_process,
         #data_preparation_div,
         #H2("Local Deduplication"), are these numbers even right?