Spaces:
Sleeping
Sleeping
from fasthtml.common import * | |
from fasthtml.components import * | |
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline | |
from plotly import graph_objects as go | |
from fh_plotly import plotly2fasthtml | |
import pandas as pd | |
import json | |
from rich import print | |
import curated | |
import web | |
import common | |
import results | |
# Table 1 data: which sources each well-known pre-training dataset covers.
# Every column list is ordered like the "Dataset" column (TxT360 first,
# The Pile last). "Included" marks a source that is present, "-" one that
# is absent.
dataset_comparison = pd.DataFrame(
    {
        "Dataset": [
            "TxT360",
            "FineWeb",
            "RefinedWeb",
            "RedPajama-v2",
            "C4",
            "Dolma",
            "RedPajama-v1",
            "The Pile",
        ],
        "CommonCrawl": [
            "99 Snapshots",
            "96 Snapshots",
            "90 Snapshots",
            "84 Snapshots",
            "1 Snapshots",
            "24 Snapshots",
            "5 Snapshots",
            "0.6% of 74 Snapshots",
        ],
        "Papers": [
            "5 Sources",
            "-",
            "-",
            "-",
            "-",
            "1 Source",
            "1 Source",
            "4 Sources",
        ],
        # NOTE(review): the Dolma and RedPajama-v1 cells used to hold the
        # editorial question "what does a check mark mean?", which was
        # rendered verbatim in the published table. A check mark is taken
        # to mean the source is present, so the table's own "Included"
        # marker is used -- confirm against the upstream comparison.
        "Wikipedia": [
            "310+ Languages",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "English Only",
        ],
        "FreeLaw": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "DM Math": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "USPTO": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "PG-19": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "Included",
        ],
        "HackerNews": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "Ubuntu IRC": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "EuroParl": ["Included", "-", "-", "-", "-", "-", "-", "Included"],
        "StackExchange": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
        ],
        # NOTE(review): the TxT360 cell used to read "- what is this?"; the
        # trailing question was a leftover note, so only the "-" marker is
        # kept -- confirm whether TxT360 should carry a footnote here.
        "Code": [
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "Included",
        ],
    }
)

# Render without the index column or the legacy border attribute, and wrap
# the raw markup in NotStr so it is inserted as HTML rather than escaped.
table_html = dataset_comparison.to_html(index=False, border=0)
table_div = Div(NotStr(table_html), style="margin: 40px;")
# Table 2 data: one record per TxT360 data source, in display order.
# Each tuple is (data source, raw data size, token count, cut-off date);
# "-" is shown where no cut-off date is given.
dataset_sources = pd.DataFrame(
    [
        ("CommonCrawl", "11 TB", "5.71T", "2024-30"),
        ("Papers", "712 GB", "154.96B", "Q4 2023"),
        ("Wikipedia", "210 GB", "4.75B", "-"),
        ("Freelaw", "23 GB", "7.34B", "Q1 2024"),
        ("DM Math", "22 GB", "5.23B", "-"),
        ("USPTO", "45 GB", "4.95B", "Q4 2023"),
        ("PG-19", "11 GB", "2.94B", "-"),
        ("HackerNews", "4.1 GB", "1.08B", "Q4 2023"),
        ("Ubuntu IRC", "4.7 GB", "1.54B", "Q4 2023"),
        ("Europarl", "6.1 GB", "1.96B", "-"),
        ("StackExchange", "45 GB", "8.37B", "Q4 2023"),
    ],
    columns=["Data Source", "Raw Data Size", "Token Count", "Cut-Off Date"],
)

# Same rendering as the comparison table: no index column, no border
# attribute, raw HTML passed through NotStr inside a margin-padded Div.
table_html = dataset_sources.to_html(index=False, border=0)
table_div1 = Div(NotStr(table_html), style="margin: 40px;")
def overview():
    """Assemble the overview page fragment.

    Returns a ``Div`` wrapping a single ``Section`` (id ``inner-text``)
    that holds, in order: the heading, the motivating question, the
    Table 1 caption followed by ``table_div``, and the Table 2 caption
    followed by ``table_div1``.
    """
    heading = H2("Combining the Best of Web and Curated Sources")
    question = H3("Why combine the web and highly curated sources? Isn't the web-only data enough?")
    comparison_caption = P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources.")
    statistics_caption = P("Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented.")

    # Each caption is placed directly before the module-level table it
    # describes.
    body = Section(
        heading,
        question,
        comparison_caption,
        table_div,
        statistics_caption,
        table_div1,
        id="inner-text",
    )
    return Div(body)