victormiller committed on
Commit
e93fc1a
1 Parent(s): 858c4bf

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +49 -0
curated.py CHANGED
@@ -808,6 +808,54 @@ fig.update_layout(
808
  # Show the plot
809
  stacked_bar = fig
810
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
811
 
812
  def curated(request):
813
 
@@ -943,6 +991,7 @@ def curated(request):
943
  plotly2fasthtml(get_chart_28168342()),
944
  plotly2fasthtml(get_chart_new()),
945
  plotly2fasthtml(stacked_bar),
 
946
  H2("Curated Sources Processing"),
947
  filtering_process,
948
  data_preparation_div,
 
808
  # Show the plot
809
  stacked_bar = fig
810
 
811
+ # Aggregating the data for filters and datasets
812
+ filter_data = {
813
+ 'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
814
+ 'Wikipedia': [0, 1146416, 60468491, 60468491],
815
+ 'Freelaw': [2280522, 5518932, 68171834, 68123174],
816
+ 'DM Maths': [0, 0, 112559888, 112559888],
817
+ 'USPTO': [1312, 129042, 6749922, 6749389],
818
+ 'PG19': [69, 1, 28682, 28632],
819
+ 'Hackernews': [54129, 314, 2010488, 2003636],
820
+ 'Ubuntu IRC': [14465, 33, 23468, 23205],
821
+ 'Europarl': [0, 0, 69814, 69814],
822
+ 'StackExchange': [0, 196, 23246352, 23246352],
823
+ 'Arxiv': [42426, 105601, 1763840, 1762661],
824
+ 'S2ORC': [0, 0, 12963563, 12963563],
825
+ 'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
826
+ 'Pubmed Central': [400446, 62176, 4768310, 4767474],
827
+ 'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
828
+ 'Phil Papers': [10214, 0, 39175, 39128]
829
+ }
830
+
831
+ # Creating a new dataframe for the filter data
832
+ filter_df = pd.DataFrame(filter_data)
833
+
834
+ # Creating the stacked bar chart
835
+ fig = go.Figure()
836
+
837
+ # Add trace for each dataset
838
+ for dataset in filter_df.columns[1:]:
839
+ fig.add_trace(go.Bar(
840
+ name=dataset,
841
+ x=filter_df['Filter'],
842
+ y=filter_df[dataset]
843
+ ))
844
+
845
+ # Update the layout
846
+ fig.update_layout(
847
+ barmode='stack',
848
+ title='Stacked Bar Chart of Filters for Each Dataset',
849
+ xaxis_title='Filter',
850
+ yaxis_title='Number of Lines',
851
+ legend_title='Dataset',
852
+ height=600,
853
+ width=1000
854
+ )
855
+
856
+ # Keep a reference to the figure so the page can embed it
857
+ diff_stacked_bar = fig
858
+
859
 
860
  def curated(request):
861
 
 
991
  plotly2fasthtml(get_chart_28168342()),
992
  plotly2fasthtml(get_chart_new()),
993
  plotly2fasthtml(stacked_bar),
994
+ plotly2fasthtml(diff_stacked_bar),
995
  H2("Curated Sources Processing"),
996
  filtering_process,
997
  data_preparation_div,