fh-new-vm1

Sleeping

App Files Files Community

fh-new-vm1 / overview.py

victormiller

Update overview.py

17b2190 verified 2 months ago

raw

history blame

5.81 kB

	from fasthtml.common import *
	from fasthtml.components import *
	from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
	from plotly import graph_objects as go
	from fh_plotly import plotly2fasthtml
	import pandas as pd
	import json
	from rich import print
	import curated
	import web
	import common
	import results

	# Table 1 data: one row per dataset, one column per data source.
	# Every list below is positional and must stay aligned with the "Dataset"
	# column (TxT360, FineWeb, RefinedWeb, RedPajama-v2, C4, Dolma,
	# RedPajama-v1, The Pile).  "-" is the marker used for "no entry".
	dataset_comparison = pd.DataFrame(
	    {
	        "Dataset": [
	            "TxT360",
	            "FineWeb",
	            "RefinedWeb",
	            "RedPajama-v2",
	            "C4",
	            "Dolma",
	            "RedPajama-v1",
	            "The Pile",
	        ],
	        "CommonCrawl": [
	            "99 Snapshots",
	            "96 Snapshots",
	            "90 Snapshots",
	            "84 Snapshots",
	            "1 Snapshots",
	            "24 Snapshots",
	            "5 Snapshots",
	            "0.6% of 74 Snapshots",
	        ],
	        "Papers": [
	            "5 Sources", "-", "-", "-", "-",
	            "1 Source", "1 Source", "4 Sources",
	        ],
	        # Dolma and RedPajama-v1 use the same "Included" marker as the other
	        # columns; a stray reviewer question must never be a cell value,
	        # because every cell is rendered verbatim into the page.
	        "Wikipedia": [
	            "310+ Languages", "-", "-", "-", "-",
	            "Included", "Included", "English Only",
	        ],
	        "FreeLaw": [
	            "Included", "-", "-", "-", "-", "-", "-", "Included",
	        ],
	        "DM Math": [
	            "Included", "-", "-", "-", "-", "-", "-", "Included",
	        ],
	        "USPTO": [
	            "Included", "-", "-", "-", "-", "-", "-", "Included",
	        ],
	        "PG-19": [
	            "Included", "-", "-", "-", "-",
	            "Included", "Included", "Included",
	        ],
	        "HackerNews": [
	            "Included", "-", "-", "-", "-", "-", "-", "Included",
	        ],
	        "Ubuntu IRC": [
	            "Included", "-", "-", "-", "-", "-", "-", "Included",
	        ],
	        "EuroParl": [
	            "Included", "-", "-", "-", "-", "-", "-", "Included",
	        ],
	        "StackExchange": [
	            "Included", "-", "-", "-", "-", "-", "Included", "Included",
	        ],
	        # TxT360 has no code entry, so it carries the plain "-" marker
	        # (TODO confirm with the dataset authors if a footnote is wanted).
	        "Code": [
	            "-", "-", "-", "-", "-",
	            "Included", "Included", "Included",
	        ],
	    }
	)

	# Render the comparison table to an HTML string (no index column, border
	# turned off) and wrap it in a Div with a 40px margin.  NotStr presumably
	# keeps the pre-rendered markup from being escaped — confirm against FastHTML.
	table_html = dataset_comparison.to_html(border=0, index=False)
	table_div = Div(
	    NotStr(table_html),
	    style="margin: 40px;",
	)

	# Table 2 data, written one record per data source so each source's
	# figures can be read (and edited) on a single line.
	# Field order in every tuple matches ``columns`` below.
	dataset_sources = pd.DataFrame(
	    [
	        # "2024-30" is presumably a CommonCrawl snapshot id — TODO confirm.
	        ("CommonCrawl", "11 TB", "5.71T", "2024-30"),
	        ("Papers", "712 GB", "154.96B", "Q4 2023"),
	        ("Wikipedia", "210 GB", "4.75B", "-"),
	        ("Freelaw", "23 GB", "7.34B", "Q1 2024"),
	        ("DM Math", "22 GB", "5.23B", "-"),
	        ("USPTO", "45 GB", "4.95B", "Q4 2023"),
	        ("PG-19", "11 GB", "2.94B", "-"),
	        ("HackerNews", "4.1 GB", "1.08B", "Q4 2023"),
	        ("Ubuntu IRC", "4.7 GB", "1.54B", "Q4 2023"),
	        ("Europarl", "6.1 GB", "1.96B", "-"),
	        ("StackExchange", "45 GB", "8.37B", "Q4 2023"),
	    ],
	    columns=["Data Source", "Raw Data Size", "Token Count", "Cut-Off Date"],
	)

	# Render the per-source statistics table to an HTML string (no index
	# column, border turned off) and wrap it in a Div with a 40px margin.
	# NOTE: this rebinds the module-level ``table_html`` name.
	table_html = dataset_sources.to_html(border=0, index=False)
	table_div1 = Div(
	    NotStr(table_html),
	    style="margin: 40px;",
	)

	def overview():
	    """Build the overview section of the page.

	    Returns:
	        A ``Div`` wrapping a single ``Section`` with ``id="inner-text"``.
	        The section holds, in order: the H2 title, the H3 question, the
	        Table 1 caption, ``table_div``, the Table 2 caption and
	        ``table_div1``.
	    """
	    # Children are listed in display order; the two table Divs are the
	    # module-level objects built from the DataFrames.
	    children = (
	        H2("Combining the Best of Web and Curated Sources"),
	        H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
	        P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
	        table_div,
	        P("Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented."),
	        table_div1,
	    )
	    section = Section(*children, id="inner-text")
	    return Div(section)