omwdataset

Runtime error

App Files Files Community

victormiller committed on Oct 1, 2024

Commit

858c4bf

verified ·

1 Parent(s): 10a8615

Update curated.py

Browse files

Files changed (1) hide show

curated.py +60 -1

curated.py CHANGED Viewed

@@ -9,7 +9,6 @@ from rich import print
 import uuid
 import plotly.express as px
 overview = Div(
             H2("Curated Source Processing Overview"),
             H3("What This Section Contains"),
@@ -751,6 +750,65 @@ def update(target: str, request):
             params.get(f"data_source_{target}"), doc_id, target)
 def curated(request):
     # Partial Updates
@@ -884,6 +942,7 @@ def curated(request):
             data_preprocessing_div,
             plotly2fasthtml(get_chart_28168342()),
             plotly2fasthtml(get_chart_new()),
             H2("Curated Sources Processing"),
             filtering_process,
             data_preparation_div,

 import uuid
 import plotly.express as px
 overview = Div(
             H2("Curated Source Processing Overview"),
             H3("What This Section Contains"),
             params.get(f"data_source_{target}"), doc_id, target)
+# Per-dataset line counts (downloaded, per-filter columns, remaining) used to build the stacked bar chart
+data = {
+    'Dataset': ['Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19', 'Hackernews', 'Ubuntu IRC', 'Europarl',
+                'StackExchange', 'Arxiv', 'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers'],
+    'Downloaded Lines': [61614907, 75971288, 112559888, 6880276, 28752, 2064931, 37966, 69814, 23246548, 1911867,
+                         12963563, 102324176, 5230932, 25787474, 49389],
+    'Language Filter': [0, 2280522, 0, 1312, 69, 54129, 14465, 0, 0, 42426, 0, 18456575, 400446, 3100, 10214],
+    'Min Word Count': [1146416, 5518932, 0, 129042, 1, 314, 33, 0, 196, 105601, 0, 978308, 62176, 36419, 0],
+    'Unigram log probability': [60468491, 68171834, 112559888, 6749922, 28682, 2010488, 23468, 69814, 23246352,
+                                1763840, 12963563, 82889293, 4768310, 25747955, 39175],
+    'Total Lines Remaining': [60468491, 68123174, 112559888, 6749389, 28632, 2003636, 23205, 69814, 23246352,
+                              1762661, 12963563, 82777912, 4767474, 25746724, 39128]
+}
+df = pd.DataFrame(data)
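+# Create the stacked bar chart
+# NOTE(review): the 'Unigram log probability' values match or nearly match 'Total Lines Remaining'
+# (e.g. both are 60468491 for Wikipedia), so they look like lines kept rather than lines removed;
+# confirm before stacking them together with the removal counts.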
+fig = go.Figure()
+# Add one bar trace per filter column, plus a final trace for the lines remaining ('Downloaded Lines' is not plotted)
+fig.add_trace(go.Bar(
+    name='Language Filter',
+    x=df['Dataset'],
+    y=df['Language Filter']
+))
+fig.add_trace(go.Bar(
+    name='Min Word Count Filter',
+    x=df['Dataset'],
+    y=df['Min Word Count']
+))
+fig.add_trace(go.Bar(
+    name='Unigram log probability Filter',
+    x=df['Dataset'],
+    y=df['Unigram log probability']
+))
+fig.add_trace(go.Bar(
+    name='Total Lines Remaining',
+    x=df['Dataset'],
+    y=df['Total Lines Remaining']
+))
+# Update the layout
+fig.update_layout(
+    barmode='stack',
+    title='Stacked Bar Chart of Line Reductions by Dataset',
+    xaxis_title='Dataset',
+    yaxis_title='Number of Lines',
+    legend_title='Filters',
+    height=600,
+    width=1000
+)
+# Bind the figure to a module-level name; curated() passes it to plotly2fasthtml (nothing is shown here)
+stacked_bar = fig
 def curated(request):
     # Partial Updates
             data_preprocessing_div,
             plotly2fasthtml(get_chart_28168342()),
             plotly2fasthtml(get_chart_new()),
+            plotly2fasthtml(stacked_bar),
             H2("Curated Sources Processing"),
             filtering_process,
             data_preparation_div,