"""Overview page section: dataset comparison and TxT360 statistics tables."""

from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import curated
import web
import common
import results


def _margin_wrapped(markup: str):
    """Wrap pre-rendered HTML table markup in a Div with a 40px margin."""
    # NotStr passes the markup through as raw HTML rather than as text.
    return Div(NotStr(markup), style="margin: 40px;")


# Table 1 data: one row per dataset, one column per data source. Every list
# below is ordered like the "Dataset" column; "-" marks a source that the
# dataset does not cover.
dataset_comparison = pd.DataFrame(
    {
        "Dataset": [
            "TxT360", "FineWeb", "RefinedWeb", "RedPajama-v2",
            "C4", "Dolma", "RedPajama-v1", "The Pile",
        ],
        "CommonCrawl": [
            "99 Snapshots", "96 Snapshots", "90 Snapshots", "84 Snapshots",
            "1 Snapshots", "24 Snapshots", "5 Snapshots", "0.6% of 74 Snapshots",
        ],
        "Papers": ["5 Sources", "-", "-", "-", "-", "1 Source", "1 Source", "4 Sources"],
        # NOTE(review): the Dolma and RedPajama-v1 cells read like leftover
        # editorial questions and are rendered verbatim -- confirm the
        # intended values.
        "Wikipedia": [
            "310+ Languages", "-", "-", "-", "-",
            "what does a check mark mean?",
            "what does a check mark mean?",
            "English Only",
        ],
        "FreeLaw": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "DM Math": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "USPTO": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "PG-19": ["Included", "-", "-", "-", "-", "Included", "Included", "Included"],
        "HackerNews": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "Ubuntu IRC": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "EuroParl": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "StackExchange": ["Included", "-", "-", "-", "-", "-", "Included", "Included"],
        # NOTE(review): the TxT360 cell looks like an unresolved editorial
        # note and is rendered verbatim -- confirm the intended value.
        "Code": ["- what is this?", "-", "-", "-", "-", "Included", "Included", "Included"],
    }
)

table_div = _margin_wrapped(dataset_comparison.to_html(index=False, border=0))

# Table 2 data: one row per TxT360 data source.
_SOURCE_COLUMNS = ["Data Source", "Raw Data Size", "Token Count", "Cut-Off Date"]
_SOURCE_ROWS = [
    ("CommonCrawl", "11 TB", "5.71T", "2024-30"),
    ("Papers", "712 GB", "154.96B", "Q4 2023"),
    ("Wikipedia", "210 GB", "4.75B", "-"),
    ("Freelaw", "23 GB", "7.34B", "Q1 2024"),
    ("DM Math", "22 GB", "5.23B", "-"),
    ("USPTO", "45 GB", "4.95B", "Q4 2023"),
    ("PG-19", "11 GB", "2.94B", "-"),
    ("HackerNews", "4.1 GB", "1.08B", "Q4 2023"),
    ("Ubuntu IRC", "4.7 GB", "1.54B", "Q4 2023"),
    ("Europarl", "6.1 GB", "1.96B", "-"),
    ("StackExchange", "45 GB", "8.37B", "Q4 2023"),
]
dataset_sources = pd.DataFrame(_SOURCE_ROWS, columns=_SOURCE_COLUMNS)

# `table_html` stays a module-level name holding the statistics-table markup.
table_html = dataset_sources.to_html(index=False, border=0)
table_div1 = _margin_wrapped(table_html)


def overview():
    """Build the overview section containing both tables and their captions.

    Returns:
        A ``Div`` wrapping a single ``Section`` (id ``inner-text``) that holds
        the heading, the sub-heading, and each caption followed by its table.
    """
    heading = H2("Combining the Best of Web and Curated Sources")
    question = H3(
        "Why combine the web and highly curated sources? Isn't the web-only data enough?"
    )
    comparison_caption = P(
        "Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."
    )
    statistics_caption = P(
        "Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented."
    )
    section = Section(
        heading,
        question,
        comparison_caption,
        table_div,
        statistics_caption,
        table_div1,
        id="inner-text",
    )
    return Div(section)