victormiller committed on
Commit
25a9fcb
·
verified ·
1 Parent(s): c2f326c

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +57 -39
curated.py CHANGED
@@ -9,6 +9,57 @@ from rich import print
9
  import uuid
10
  import plotly.express as px
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  filtering_process = Div(
13
  Section(
14
  P("This section contains the specific steps taken to filter all 14 curated source datasets.")
@@ -353,45 +404,11 @@ filtering_process = Div(
353
 
354
 
355
 
356
- overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
357
- copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
358
 
359
- local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
360
 
361
- treemap_data = {
362
- 'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
363
- 'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
364
- 'Count': [100, 200, 150, 120, 80, 90, 300, 250, 180, 150, 150, 250, 180, 120, 90],
365
- 'Details': [
366
- 'A repository of scientific papers in various disciplines, including computer science, physics, mathematics, and more.',
367
- 'A database of biomedical and life sciences research articles.',
368
- 'Abstracts of biomedical literature from various sources.',
369
- 'Full-text articles from the Semantic Scholar Open Research Corpus.',
370
- 'Abstracts of articles from the Semantic Scholar Open Research Corpus.',
371
- 'Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research.',
372
- 'A collaborative online encyclopedia that covers a wide range of topics.',
373
- 'A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more.',
374
- 'A collection of multilingual parallel corpora of parliamentary debates from the European Parliament.',
375
- 'Chat logs from the Ubuntu Internet Relay Chat (IRC) channels.',
376
- 'Legal documents and court cases from various jurisdictions.',
377
- 'A collection of books from Project Gutenberg, a digital library of public domain works.',
378
- 'Patent documents from the United States Patent and Trademark Office.',
379
- 'User-generated news and discussion platform focused on technology and startups.',
380
- 'Deep Mind Maths dataset with generated questions.'
381
- ]
382
- }
383
- # Calculate percentage for each data source
384
- total_count = sum(treemap_data['Count'])
385
- treemap_data['Percentage'] = [count / total_count * 100 for count in treemap_data['Count']]
386
-
387
- # Create treemap
388
- fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hover_data=['Details', 'Percentage'], hover_name='Source')
389
-
390
- # Set the size of the chart
391
 
392
 
393
- # Display treemap if you want to update the size.update_layout(width=800, height=600)
394
- treemap_chart = fig
395
 
396
 
397
 
@@ -743,7 +760,7 @@ def curated(request):
743
  or modules dedicated to the dataset.""")
744
 
745
  data_preparation_div = Div(
746
- H3("Data Preparation"),
747
  text,
748
  table_div,
749
  Div(
@@ -812,17 +829,18 @@ def curated(request):
812
  data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
813
 
814
  return Div(
 
815
  H2("Curated Sources: Overview"),
816
  overview_text,
817
  copyright_disclaimer,
818
  plotly2fasthtml(treemap_chart),
 
819
  table_desc,
 
 
820
  H2("Curated Sources Processing"),
821
  filtering_process,
822
  data_preparation_div,
823
- H3("Data Filtering"),
824
- data_preprocessing_div,
825
- plotly2fasthtml(get_chart_28168342()),
826
  H2("Local Deduplication"),
827
  local_dedup_text,
828
  table_div_data_pipe,
 
9
  import uuid
10
  import plotly.express as px
11
 
12
+
13
+ overview = Div(
14
+ H2("Curated Source Processing Overview"),
15
+ H3("What This Section Contains"),
16
+ P("This section provides a complete discussion on the filtering applied to the 14 curated sources that comprise the non-web data section of TxT360. The section is split into the following topic areas: "),
17
+ Ul(
18
+ Li("Curated Sources Data Processing Summary", style = "margin-bottom: 5px"),
19
+ Li("Individual Filtering Discussion for Each Source", style = "margin-bottom: 5px"),
20
+ ),
21
+ ),
22
+
23
+ overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
24
+ copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
25
+
26
+ treemap_data = {
27
+ 'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
28
+ 'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
29
+ 'Count': [100, 200, 150, 120, 80, 90, 300, 250, 180, 150, 150, 250, 180, 120, 90],
30
+ 'Details': [
31
+ 'A repository of scientific papers in various disciplines, including computer science, physics, mathematics, and more.',
32
+ 'A database of biomedical and life sciences research articles.',
33
+ 'Abstracts of biomedical literature from various sources.',
34
+ 'Full-text articles from the Semantic Scholar Open Research Corpus.',
35
+ 'Abstracts of articles from the Semantic Scholar Open Research Corpus.',
36
+ 'Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research.',
37
+ 'A collaborative online encyclopedia that covers a wide range of topics.',
38
+ 'A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more.',
39
+ 'A collection of multilingual parallel corpora of parliamentary debates from the European Parliament.',
40
+ 'Chat logs from the Ubuntu Internet Relay Chat (IRC) channels.',
41
+ 'Legal documents and court cases from various jurisdictions.',
42
+ 'A collection of books from Project Gutenberg, a digital library of public domain works.',
43
+ 'Patent documents from the United States Patent and Trademark Office.',
44
+ 'User-generated news and discussion platform focused on technology and startups.',
45
+ 'Deep Mind Maths dataset with generated questions.'
46
+ ]
47
+ }
48
+ # Calculate percentage for each data source
49
+ total_count = sum(treemap_data['Count'])
50
+ treemap_data['Percentage'] = [count / total_count * 100 for count in treemap_data['Count']]
51
+
52
+ # Create treemap
53
+ fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hover_data=['Details', 'Percentage'], hover_name='Source')
54
+
55
+ # Set the size of the chart
56
+
57
+
58
+ # Display treemap; to change its size, call fig.update_layout(width=800, height=600)
59
+ treemap_chart = fig
60
+
61
+
62
+
63
  filtering_process = Div(
64
  Section(
65
  P("This section contains the specific steps taken to filter all 14 curated source datasets.")
 
404
 
405
 
406
 
 
 
407
 
 
408
 
409
+ local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
 
 
 
412
 
413
 
414
 
 
760
  or modules dedicated to the dataset.""")
761
 
762
  data_preparation_div = Div(
763
+ H2("Data Preparation"),
764
  text,
765
  table_div,
766
  Div(
 
829
  data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
830
 
831
  return Div(
832
+ overview,
833
  H2("Curated Sources: Overview"),
834
  overview_text,
835
  copyright_disclaimer,
836
  plotly2fasthtml(treemap_chart),
837
+ H2("Curated Sources Defined"),
838
  table_desc,
839
+ data_preprocessing_div,
840
+ plotly2fasthtml(get_chart_28168342()),
841
  H2("Curated Sources Processing"),
842
  filtering_process,
843
  data_preparation_div,
 
 
 
844
  H2("Local Deduplication"),
845
  local_dedup_text,
846
  table_div_data_pipe,