Spaces:

facat
/

ml-summit

Runtime error

App Files Files Community

ml-summit / src /scripts /plot.ju.py

facat

init

2fc4496 unverified 3 months ago

raw

history blame contribute delete

No virus

11.3 kB

	# %%

	# %cd ~/docs/0425-ml_summit/scripts/
	import plotly.express as px
	from plotly.graph_objs import Figure, FigureWidget
	import datasets
	import pandas as pd
	import huggingface_hub
	import plotly.graph_objs as go
	import numpy as np
	from PIL import Image

	# Registry of every figure built by this script, keyed by a short name.
	FIGURES: dict[str, Figure] = {}
	# %%

	# Treemap of the rows in nlp_datas.csv: a constant root, then the "task"
	# column, then the "dataset" column, with tiles sized by the "size" column.
	nlp_levels = [px.Constant("nlp-datasets"), "task", "dataset"]
	df = pd.read_csv("nlp_datas.csv")
	fig = px.treemap(df, path=nlp_levels, values="size")

	# Fully transparent paper background and no outer margins.
	fig.update_layout(
	    paper_bgcolor="rgba(0,0,0,0)",
	    margin={"t": 0, "l": 0, "r": 0, "b": 0},
	)
	FIGURES["nlp"] = fig
	fig
	# %%
	# Treemap of the rows in llm.csv: a constant "LLM" root with one tile per
	# "dataset" value, sized by the "size" column.
	llm_levels = [px.Constant("LLM"), "dataset"]
	df = pd.read_csv("llm.csv")
	fig = px.treemap(df, path=llm_levels, values="size")

	# Fully transparent paper background and no outer margins.
	fig.update_layout(
	    paper_bgcolor="rgba(0,0,0,0)",
	    margin={"t": 0, "l": 0, "r": 0, "b": 0},
	)
	FIGURES["gpt"] = fig
	fig
	# %%

	# Line chart of "time" against "sequence length", one line per model column
	# of seq-time.csv.
	df = pd.read_csv("./seq-time.csv", index_col=0)

	# Index labels are turned into numbers by rewriting "k" as "*1024" and
	# evaluating the resulting expression.
	# NOTE(review): eval() runs whatever text is in the CSV index; acceptable for
	# a trusted local file, but switch to an explicit parser if this file can
	# ever come from elsewhere.
	df.index = df.index.map(lambda label: eval(label.replace("k", "*1024")))

	# The "platformers" column is scaled down by 7 (reason not visible here).
	df["platformers"] = df["platformers"].div(7)
	# The last column is not plotted.
	df = df.drop(columns=[df.columns[-1]])

	# Long format: one row per (sequence length, model) pair.
	df = df.reset_index(names="sequence length")
	df = df.melt(id_vars="sequence length", var_name="model", value_name="time")

	fig = px.line(df, x="sequence length", y="time", color="model")
	fig.update_layout(
	    paper_bgcolor="rgba(0,0,0,0)",
	    margin={"t": 0, "l": 0, "r": 0, "b": 0},
	    plot_bgcolor="rgba(0,0,0,0)",
	    legend_font={"color": "white"},
	)
	# White axis text on both axes.
	fig.update_xaxes(color="white")
	fig.update_yaxes(color="white")
	FIGURES["seq-time"] = fig
	fig
	# %%

	# Grouped bar chart of "tflops" against "sequence length", one bar colour
	# per model column of seq-tflops.csv.
	df = pd.read_csv("seq-tflops.csv", index_col=0)

	# Long format: one row per (sequence length, model) pair.
	df = df.reset_index(names="sequence length")
	df = df.melt(id_vars="sequence length", var_name="model", value_name="tflops")

	fig = px.bar(
	    df,
	    x="sequence length",
	    y="tflops",
	    color="model",
	    barmode="group",
	)
	fig.update_layout(
	    paper_bgcolor="rgba(0,0,0,0)",
	    margin={"t": 0, "l": 0, "r": 0, "b": 0},
	    plot_bgcolor="rgba(0,0,0,0)",
	    legend_font={"color": "white"},
	)
	# White axis text on both axes.
	fig.update_xaxes(color="white")
	fig.update_yaxes(color="white")
	FIGURES["seq-tflops"] = fig
	fig
	# %%


	# Table preview of the first 100 "train" rows of the SUSTech/webvid dataset,
	# with the "duration" column removed.
	df = datasets.load_dataset("SUSTech/webvid", split="train[:100]").to_pandas()
	df = df.drop(columns=["duration"])

	column_names = list(df.columns)
	table = go.Table(
	    header={
	        "values": column_names,
	        "fill_color": "paleturquoise",
	        "align": "left",
	    },
	    cells={
	        "values": [df[name] for name in column_names],
	        "fill_color": "lavender",
	        "align": "left",
	    },
	)
	fig = go.Figure(data=[table])

	# Fully transparent paper background and no outer margins.
	fig.update_layout(
	    paper_bgcolor="rgba(0,0,0,0)",
	    margin={"t": 0, "l": 0, "r": 0, "b": 0},
	)
	FIGURES["webvid"] = fig
	# %%

	# Bar chart of token counts for a few hard-coded long-context workloads.
	data = {
	    "402-page transcripts from Apollo 11’s mission to the moon": 326914,
	    "44-minute silent Buster Keaton movie": 696417,
	    "more than 100,000 lines of code": 816767,
	    "Generate 1min video": 1000000,
	}

	# Two-column frame: "task" holds the labels, "token" holds the counts.
	df = pd.Series(data, name="token").to_frame().reset_index(names="task")

	# px.bar builds the figure directly, so no placeholder figure is created
	# beforehand. text_auto=".2s" prints each value on its bar in SI notation.
	fig = px.bar(
	    df,
	    y="token",
	    x="task",
	    text_auto=".2s",
	)
	FIGURES["token-bar"] = fig

	# Value labels sit above the bars, in white, and may overflow the plot area.
	fig.update_traces(
	    textfont_size=12,
	    textangle=0,
	    textposition="outside",
	    cliponaxis=False,
	    textfont_color="white",
	)
	# Transparent backgrounds and no outer margins.
	fig.update_layout(
	    paper_bgcolor="rgba(0,0,0,0)",
	    margin=dict(t=0, l=0, r=0, b=0),
	    plot_bgcolor="rgba(0,0,0,0)",
	    legend_font=dict(color="white"),
	)

	# Axes: white text, no axis lines, zero lines or grid; the x axis has no title.
	fig.update_xaxes(
	    color="white",
	    zeroline=False,
	    showline=False,
	    showgrid=False,
	    title="",
	)
	fig.update_yaxes(
	    showline=False,
	    showgrid=False,
	    zeroline=False,
	    color="white",
	)
	fig


	# %%
	def generate_loss(steps, initial_loss, decay_rate, noise_factor):
	    """Return an exponentially decaying curve with proportional Gaussian noise.

	    Args:
	        steps: NumPy array of step positions; the result has the same shape.
	        initial_loss: Value of the noise-free curve at step 0.
	        decay_rate: Rate of the exponential decay.
	        noise_factor: Scale of the noise relative to the noise-free curve.

	    Returns:
	        ``initial_loss * exp(-decay_rate * steps)`` plus noise drawn from
	        NumPy's global random state.
	    """
	    clean = initial_loss * np.exp(-decay_rate * steps)
	    # Noise amplitude shrinks together with the curve itself.
	    amplitude = noise_factor * clean
	    jitter = amplitude * np.random.randn(*steps.shape)
	    return clean + jitter


	def splitpoints(total, split):
	    """Yield ``split`` consecutive slices that together cover ``total`` items.

	    The first ``split - 1`` slices each span ``total // split`` items; the
	    final slice is open-ended so it absorbs any remainder.

	    Args:
	        total: Number of items to partition.
	        split: Number of slices to produce; must be at least 1.

	    Yields:
	        ``slice`` objects in ascending order.

	    Raises:
	        ValueError: If ``split`` is smaller than 1 (raised on first iteration).
	    """
	    if split < 1:
	        raise ValueError(f"split must be >= 1, got {split}")
	    step = total // split
	    for i in range(split - 1):
	        yield slice(i * step, (i + 1) * step)
	    # Computed from ``split`` rather than the loop variable, which is unbound
	    # when split == 1 and the loop body never runs.
	    yield slice((split - 1) * step, None)


	# One entry per segment of the loss curve. Only "name" and "color" are read
	# by the code in this cell; the "icon" paths are not used here (the logos
	# below are loaded from hard-coded paths instead).
	meta = [
	{
	"name": "2xDGX on aws",
	"color": "red",
	"icon": "../figures/gc.png",
	},
	{
	"name": "16xDGX on aliyun",
	"color": "orange",
	"icon": "../figures/aws-white.png",
	},
	{
	"name": "128xDGX on ucloud",
	"color": "blue",
	"icon": "../figures/aliyun.png",
	},
	]


	# Synthetic loss curve: 1000 evenly spaced steps in [0, 1] with noisy
	# exponential decay.
	steps = np.linspace(0, 1, 1000)
	loss = generate_loss(steps, initial_loss=1, decay_rate=5, noise_factor=0.1)
	fig = go.Figure()
	# fig.update_layout(
	# title="Training Loss by Steps", xaxis_title="Steps", yaxis_title="Loss"
	# )

	FIGURES["cloud-switch"] = fig
	# Draw the curve as len(meta) consecutive segments, each as its own trace
	# with the name and colour of the matching meta entry.
	# NOTE(review): indentation is lost in this copy, so it is unclear whether
	# the add_layout_image calls below belong to the loop body — confirm.
	for i, idx in enumerate(splitpoints(1000, len(meta))):
	fig.add_trace(
	go.Scatter(
	x=steps[idx],
	y=loss[idx],
	mode="lines",
	name=meta[i]["name"],
	line=dict(color=meta[i]["color"]),
	)
	)
	# Logo images placed in paper coordinates (fractions of the figure area),
	# drawn above the traces.
	fig.add_layout_image(
	x=0.8,
	sizex=0.2,
	y=0.2,
	sizey=0.2,
	xref="paper",
	yref="paper",
	opacity=1.0,
	layer="above",
	source=Image.open("../figures/logo/ucloud.png"),
	)
	fig.add_layout_image(
	x=0.17,
	sizex=0.15,
	y=0.7,
	sizey=0.15,
	xref="paper",
	yref="paper",
	opacity=1.0,
	layer="above",
	source=Image.open("../figures/aws-white.png"),
	)
	fig.add_layout_image(
	x=0.43,
	sizex=0.15,
	y=0.3,
	sizey=0.15,
	xref="paper",
	yref="paper",
	opacity=1.0,
	layer="above",
	source=Image.open("../figures/aliyun.png"),
	)

	# No legend; fully transparent paper and plot backgrounds.
	fig.update_layout(
	showlegend=False,
	paper_bgcolor="rgba(0,0,0,0)",
	plot_bgcolor="rgba(255,255,255,0)",
	# plot_bgcolor="rgba(255,255,0)",
	# width=1120,
	)
	# x axis: no tick labels, axis line, zero line or grid.
	fig.update_xaxes(
	showticklabels=False,
	# ticklabelposition="inside left",
	showline=False,
	zeroline=False,
	showgrid=False,
	# title=dict(text="Steps", standoff=250),
	automargin=True,
	)
	# y axis: white "Loss" title and a faint dashed white grid, no tick labels.
	fig.update_yaxes(
	showticklabels=False,
	zeroline=False,
	showline=False,
	griddash="4px",
	gridcolor="rgba(255,255,255,0.3)",
	title="Loss",
	color="white",
	)
	# Trailing expression so the notebook cell displays the figure.
	fig


	# %%
	def plot_gantt(df):
	    """Build a Gantt-style timeline figure, one row and colour per task.

	    Args:
	        df: Table with "Start", "End" and "Task" columns, which are the
	            column names handed to ``px.timeline``.

	    Returns:
	        The styled plotly figure.
	    """
	    gantt = px.timeline(df, x_start="Start", x_end="End", y="Task", color="Task")

	    # Hour:minute tick format, no legend, fully transparent backgrounds.
	    gantt.update_layout(
	        xaxis_tickformat="%H:%M",
	        showlegend=False,
	        paper_bgcolor="rgba(0,0,0,0)",
	        plot_bgcolor="rgba(255,255,255,0)",
	    )

	    # x axis: no tick labels, axis line, zero line or grid.
	    x_axis_style = {
	        "showticklabels": False,
	        "showline": False,
	        "zeroline": False,
	        "showgrid": False,
	        "automargin": True,
	    }
	    gantt.update_xaxes(**x_axis_style)

	    # y axis: large white task labels over a faint dashed grid, no title.
	    y_axis_style = {
	        "zeroline": False,
	        "showline": False,
	        "griddash": "4px",
	        "gridcolor": "rgba(0,0,0,0.3)",
	        "title": "",
	        "color": "white",
	        "tickfont": {"size": 20},
	    }
	    gantt.update_yaxes(**y_axis_style)

	    return gantt


	# Synthetic timeline: a one-hour slot every four hours, each randomly
	# assigned a "Read" or "Transform" task; consecutive slots with the same
	# task are then merged into a single interval.
	num_rows = 1000
	download_prop = 0.65
	slot_starts = pd.date_range("1-jan-2021", periods=num_rows, freq="4h")
	slot_tasks = np.random.choice(
	    ["Read", "Transform"], num_rows, p=(download_prop, 1 - download_prop)
	)
	# Pin the first and last slots so the timeline always opens with "Read"
	# and closes with "Transform".
	slot_tasks[0] = "Read"
	slot_tasks[-1] = "Transform"

	df = pd.DataFrame(
	    {
	        "Start": slot_starts,
	        "End": slot_starts + pd.Timedelta(hours=1),
	        "Task": slot_tasks,
	    }
	)

	# A new run begins wherever the task differs from the previous row; each
	# run collapses to its earliest start, latest end and its task.
	run_id = df["Task"].ne(df["Task"].shift()).cumsum()
	df = df.groupby(run_id).agg({"Start": "min", "End": "max", "Task": "first"})

	timeline = df.copy()
	# %%

	# "profile-naive": overlay "Train" intervals on a copy of the timeline.
	df = timeline.copy()
	span_start = df["Start"].iloc[0]
	span_end = df["End"].iloc[-1]
	# Ten evenly spaced instants across the whole timeline; the windows used
	# are (ddi[2], ddi[3]), (ddi[5], ddi[6]) and (ddi[8], ddi[9]).
	ddi = pd.date_range(span_start, end=span_end, periods=10)
	window_starts = ddi[2:-1:3]
	window_ends = ddi[3::3]
	for start, end in zip(window_starts, window_ends):
	    # Rows starting inside the window are relabelled as "Train" ...
	    in_window = df["Start"].between(start, end)
	    df.loc[in_window, "Task"] = "Train"
	    # ... and one extra "Train" row spanning the window is appended.
	    train_row = pd.Series({"Start": start, "End": end, "Task": "Train"})
	    df.loc[len(df) + 1] = train_row

	FIGURES["profile-naive"] = plot_gantt(df)
	FIGURES["profile-naive"]
	# %%

	# "profile-old": overlay "Train" intervals on a copy of the timeline, using
	# windows that each span prop - 1 of every prop + 1 grid steps.
	df = timeline.copy()
	prop = 10
	stride = prop + 1
	span_start = df["Start"].iloc[0]
	span_end = df["End"].iloc[-1]
	ddi = pd.date_range(span_start, end=span_end, periods=stride * 10)
	window_starts = ddi[1:-1:stride]
	window_ends = ddi[prop::stride]
	for start, end in zip(window_starts, window_ends):
	    # Rows starting inside the window are relabelled as "Train" ...
	    in_window = df["Start"].between(start, end)
	    df.loc[in_window, "Task"] = "Train"
	    # ... and one extra "Train" row spanning the window is appended.
	    train_row = pd.Series({"Start": start, "End": end, "Task": "Train"})
	    df.loc[len(df) + 1] = train_row
	FIGURES["profile-old"] = plot_gantt(df)
	FIGURES["profile-old"]
	# %%

	# "profile-stream": the timeline plus a single "Train" interval running
	# alongside the whole of it.
	df = timeline.copy()

	# NOTE(review): the interval ends at the last row's Start, not its End —
	# confirm this is intended.
	train_row = pd.Series(
	    {
	        "Start": df["Start"].iloc[0],
	        "End": df["Start"].iloc[-1],
	        "Task": "Train",
	    }
	)
	df.loc[len(df) + 1] = train_row
	FIGURES["profile-stream"] = plot_gantt(df)
	FIGURES["profile-stream"]

	# %%

	# Write every registered figure as an HTML fragment (not a full page) that
	# loads plotly.js from the CDN, printing each figure's key as it goes.
	export_options = {"full_html": False, "include_plotlyjs": "cdn"}
	for k, figure in FIGURES.items():
	    print(k)
	    target = f"../components/{k}.qmd"
	    figure.write_html(target, **export_options)

	# for i in range(100):
	# print(i)
	# %%
	import qrcode
	from qrcode.image.styledpil import StyledPilImage
	from qrcode.image.styles.moduledrawers.pil import RoundedModuleDrawer
	from qrcode.image.styles.colormasks import RadialGradiantColorMask

	# QR code for the WeChat link, drawn in white on a transparent background
	# with the lowest error-correction level.
	wechat_link = "https://u.wechat.com/MAmdMGMYjGFC4-2ESxZ1oyw"
	qr_colors = {"fill_color": "white", "back_color": "transparent"}

	qr = qrcode.QRCode(error_correction=qrcode.constants.ERROR_CORRECT_L)
	qr.add_data(wechat_link)
	img_2 = qr.make_image(**qr_colors)
	img_2.save("../figures/qr/jing.png")
	# %%


	# QR code for a pre-filled mailto link, drawn in white on a transparent
	# background with the lowest error-correction level.
	mail_link = "mailto:data@sustech.edu.cn?subject=Hello&body="
	qr_colors = {"fill_color": "white", "back_color": "transparent"}

	qr = qrcode.QRCode(error_correction=qrcode.constants.ERROR_CORRECT_L)
	qr.add_data(mail_link)
	img_2 = qr.make_image(**qr_colors)
	img_2.save("../figures/qr/mail-data.png")