victormiller committed on
Commit
10a8615
1 Parent(s): d293ab8

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +48 -0
curated.py CHANGED
@@ -694,6 +694,53 @@ def get_chart_28168342():
694
  return fig
695
 
696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  def update(target: str, request):
698
  params = request.query_params
699
  if data_source := params.get(f"data_source_{target}"):
@@ -836,6 +883,7 @@ def curated(request):
836
  table_desc,
837
  data_preprocessing_div,
838
  plotly2fasthtml(get_chart_28168342()),
 
839
  H2("Curated Sources Processing"),
840
  filtering_process,
841
  data_preparation_div,
 
694
  return fig
695
 
696
 
697
def get_chart_new():
    """Build a horizontal funnel figure with one trace per curated source.

    Every trace shares the same nine stage labels on the y axis and plots
    that source's nine per-stage numbers on the x axis.

    Returns:
        The assembled Plotly figure (height 500, transparent plot background).
    """
    # Stage labels, in the order the per-source numbers below are listed.
    stages = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # Source name -> one number per stage (same order as `stages`).
    # NOTE(review): every row ends in 20 for "Local dedup", which looks like a
    # placeholder rather than a measured value -- confirm before publishing.
    per_source = {
        "Wikipedia": [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20],
        "Freelaw": [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20],
        "DM Maths": [112559888, 0, 0, 0, 0, 0, 0, 0, 20],
        "USPTO": [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20],
        "PG19": [28752, 69, 1, 0, 0, 0, 0, 50, 20],
        "Hackernews": [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20],
        "Ubuntu IRC": [37966, 14465, 33, 0, 0, 0, 0, 263, 20],
        "Europarl": [69814, 0, 0, 0, 0, 0, 0, 0, 20],
        "StackExchange": [23246548, 0, 196, 0, 0, 0, 0, 0, 20],
        "Arxiv": [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20],
        "S2ORC": [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20],
        "S2ORC Abstract": [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20],
        "PubMed Central": [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20],
        "PubMed Central Abstract": [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20],
        "PhilPapers": [49389, 10214, 0, 0, 0, 0, 0, 47, 20],
    }

    # Settings that are identical for every trace.
    shared_trace_kwargs = {
        "orientation": "h",
        "y": stages,
        "textinfo": "value+percent total",
        "textposition": "inside",
    }

    fig = go.Figure()
    # Dict insertion order keeps the traces in the order listed above.
    for source_name, stage_values in per_source.items():
        funnel = go.Funnel(name=source_name, x=stage_values, **shared_trace_kwargs)
        fig.add_trace(funnel)

    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return fig
743
+
744
  def update(target: str, request):
745
  params = request.query_params
746
  if data_source := params.get(f"data_source_{target}"):
 
883
  table_desc,
884
  data_preprocessing_div,
885
  plotly2fasthtml(get_chart_28168342()),
886
+ plotly2fasthtml(get_chart_new()),
887
  H2("Curated Sources Processing"),
888
  filtering_process,
889
  data_preparation_div,