# ml-summit / src / scripts / plot.ju.py
# (File-viewer header captured with the source: user "facat", commit "init",
# 2fc4496, unverified, 11.3 kB.)
# %%
# %cd ~/docs/0425-ml_summit/scripts/
import plotly.express as px
from plotly.graph_objs import Figure, FigureWidget
import datasets
import pandas as pd
import huggingface_hub
import plotly.graph_objs as go
import numpy as np
from PIL import Image
# Registry of every figure built by this script, keyed by a short name.
FIGURES: dict[str, Figure] = {}
# %%
# Treemap of the rows in nlp_datas.csv: a constant "nlp-datasets" root, then
# one level per "task" and one per "dataset", with tile area taken from "size".
df = pd.read_csv("nlp_datas.csv")
fig = px.treemap(
    data_frame=df,
    path=[px.Constant("nlp-datasets"), "task", "dataset"],
    values="size",
)
# Transparent paper background and zero margins on every side.
fig.update_layout(
    margin={"t": 0, "l": 0, "r": 0, "b": 0},
    paper_bgcolor="rgba(0,0,0,0)",
)
FIGURES["nlp"] = fig
# Bare expression: shows the figure when the cell is run interactively.
fig
# %%
# Treemap of the rows in llm.csv: a constant "LLM" root with one tile per
# "dataset", tile area taken from the "size" column.
df = pd.read_csv("llm.csv")
fig = px.treemap(
    data_frame=df,
    path=[px.Constant("LLM"), "dataset"],
    values="size",
)
# Transparent paper background and zero margins on every side.
fig.update_layout(
    margin={"t": 0, "l": 0, "r": 0, "b": 0},
    paper_bgcolor="rgba(0,0,0,0)",
)
FIGURES["gpt"] = fig
# Bare expression: shows the figure when the cell is run interactively.
fig
# %%
def _parse_sequence_length(label):
    """Convert a sequence-length label such as ``"4k"`` into a number.

    A trailing ``k`` multiplies the value by 1024; labels without it are taken
    as plain numbers. Integral results are returned as ``int``.

    This replaces ``eval(label.replace("k", "*1024"))``: evaluating text read
    from a file executes whatever that text contains, and it also fails when
    pandas has already parsed the index as numbers.
    """
    text = str(label).strip()
    scale = 1
    if text.endswith("k"):
        text, scale = text[:-1], 1024
    value = float(text) * scale
    return int(value) if value.is_integer() else value


# Line chart of "time" against sequence length, one line per model column of
# seq-time.csv (the first CSV column holds the sequence-length labels).
df = pd.read_csv("./seq-time.csv", index_col=0)
df.index = df.index.map(_parse_sequence_length)
# NOTE(review): the divisor 7 is not explained anywhere in this file —
# presumably a normalisation of the "platformers" measurements; confirm.
df["platformers"] = df["platformers"] / 7
# The last column of the CSV is deliberately left out of the plot.
df.drop([df.columns[-1]], axis=1, inplace=True)
# Wide -> long: one row per (sequence length, model) pair.
df = df.reset_index(names="sequence length").melt(
    id_vars="sequence length", var_name="model", value_name="time"
)
fig = px.line(df, x="sequence length", y="time", color="model")
FIGURES["seq-time"] = fig
# Transparent backgrounds, no margins, white legend and axis text.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    margin=dict(t=0, l=0, r=0, b=0),
    plot_bgcolor="rgba(0,0,0,0)",
    legend_font=dict(color="white"),
)
fig.update_xaxes(color="white")
fig.update_yaxes(color="white")
# Bare expression: shows the figure when the cell is run interactively.
fig
# %%
# Grouped bar chart of the "tflops" value of every model column in
# seq-tflops.csv, with one group of bars per sequence length.
df = pd.read_csv("seq-tflops.csv", index_col=0)
# Wide -> long: one row per (sequence length, model) pair.
df = df.reset_index(names="sequence length")
df = df.melt(id_vars="sequence length", var_name="model", value_name="tflops")
fig = px.bar(
    data_frame=df,
    x="sequence length",
    y="tflops",
    color="model",
    barmode="group",
)
# Transparent backgrounds, no margins, white legend text.
fig.update_layout(
    legend_font={"color": "white"},
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    margin={"t": 0, "l": 0, "r": 0, "b": 0},
)
# White axis titles, ticks and lines on both axes.
fig.update_xaxes(color="white")
fig.update_yaxes(color="white")
FIGURES["seq-tflops"] = fig
# Bare expression: shows the figure when the cell is run interactively.
fig
# %%
# Table of the first 100 training rows of the "SUSTech/webvid" dataset,
# without the "duration" column.
df = datasets.load_dataset("SUSTech/webvid", split="train[:100]").to_pandas()
df = df.drop(columns=["duration"])
table = go.Table(
    header={
        "values": df.columns.tolist(),
        "fill_color": "paleturquoise",
        "align": "left",
    },
    cells={
        # One list of cell values per column, in column order.
        "values": [df[column] for column in df.columns],
        "fill_color": "lavender",
        "align": "left",
    },
)
fig = go.Figure(data=[table])
# Transparent paper background and zero margins on every side.
fig.update_layout(
    margin={"t": 0, "l": 0, "r": 0, "b": 0},
    paper_bgcolor="rgba(0,0,0,0)",
)
FIGURES["webvid"] = fig
# %%
# Bar chart of the token counts listed below, one bar per task.
data = {
    "402-page transcripts from Apollo 11’s mission to the moon": 326914,
    "44-minute silent Buster Keaton movie": 696417,
    "more than 100,000 lines of code": 816767,
    "Generate 1min video": 1000000,
}
# Two-column frame: "task" (the dict keys) and "token" (the dict values).
df = pd.Series(data, name="token").to_frame().reset_index(names="task")
# px.bar builds the figure itself; the empty go.Figure() that used to be
# created first was overwritten here without ever being used.
fig = px.bar(
    df,
    y="token",
    x="task",
    text_auto=".2s",
)
FIGURES["token-bar"] = fig
# Value labels: white, horizontal, above the bars and never clipped.
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    textfont_color="white",
)
# Transparent backgrounds, no margins, white legend text.
fig.update_layout(
    paper_bgcolor="rgba(0,0,0,0)",
    margin=dict(t=0, l=0, r=0, b=0),
    plot_bgcolor="rgba(0,0,0,0)",
    legend_font=dict(color="white"),
)
# Both axes: white text, no zero line, axis line or grid; x has no title.
fig.update_xaxes(
    color="white",
    zeroline=False,
    showline=False,
    showgrid=False,
    title="",
)
fig.update_yaxes(
    showline=False,
    showgrid=False,
    zeroline=False,
    color="white",
)
# Bare expression: shows the figure when the cell is run interactively.
fig
# %%
def generate_loss(steps, initial_loss, decay_rate, noise_factor, rng=None):
    """Return a synthetic, exponentially decaying loss curve with noise.

    The noiseless curve is ``initial_loss * exp(-decay_rate * steps)``. Gaussian
    noise is added on top, scaled by ``noise_factor`` times the local loss, so
    the noise shrinks together with the curve.

    Args:
        steps: Step positions. Any array-like is accepted (it is converted with
            ``numpy.asarray``), not only ``numpy.ndarray``.
        initial_loss: Loss value at ``steps == 0``.
        decay_rate: Exponential decay constant.
        noise_factor: Relative noise amplitude; ``0`` yields the clean curve.
        rng: Optional ``numpy.random.Generator`` used to draw the noise, for
            reproducible output. When omitted, NumPy's global random state is
            used, exactly as before.

    Returns:
        Array of noisy loss values with the same shape as ``steps``.
    """
    steps = np.asarray(steps, dtype=float)
    loss = initial_loss * np.exp(-decay_rate * steps)
    if rng is None:
        unit_noise = np.random.randn(*steps.shape)
    else:
        unit_noise = rng.standard_normal(steps.shape)
    return loss + noise_factor * loss * unit_noise
def splitpoints(total, split):
    """Yield ``split`` consecutive slices that together cover ``total`` items.

    The first ``split - 1`` slices each span ``total // split`` items; the last
    slice is open-ended, so it also takes whatever remainder is left.

    Args:
        total: Number of items to partition.
        split: Number of slices to produce; must be at least 1.

    Yields:
        ``slice`` objects in ascending order.

    Raises:
        ValueError: If ``split`` is smaller than 1.
    """
    if split < 1:
        raise ValueError("split must be a positive integer")
    step = total // split
    for i in range(split - 1):
        yield slice(i * step, (i + 1) * step)
    # Computed from ``split`` rather than the loop variable, which is never
    # bound when ``split == 1`` and the loop body does not run.
    yield slice((split - 1) * step, None)
# One entry per segment of the "cloud-switch" loss curve: the trace name and
# the line colour of that segment.
# NOTE(review): the "icon" values are never read below, and they do not match
# the logo files actually placed on the figure (aws-white.png, aliyun.png,
# logo/ucloud.png) — confirm whether they should be corrected or removed.
meta = [
    {
        "name": "2xDGX on aws",
        "color": "red",
        "icon": "../figures/gc.png",
    },
    {
        "name": "16xDGX on aliyun",
        "color": "orange",
        "icon": "../figures/aws-white.png",
    },
    {
        "name": "128xDGX on ucloud",
        "color": "blue",
        "icon": "../figures/aliyun.png",
    },
]
steps = np.linspace(0, 1, 1000)
loss = generate_loss(steps, initial_loss=1, decay_rate=5, noise_factor=0.1)
fig = go.Figure()
FIGURES["cloud-switch"] = fig
# One coloured trace per meta entry. The split uses len(steps) instead of
# repeating the literal sample count, so the segments still cover the whole
# curve if the number of samples changes.
for entry, idx in zip(meta, splitpoints(len(steps), len(meta))):
    fig.add_trace(
        go.Scatter(
            x=steps[idx],
            y=loss[idx],
            mode="lines",
            name=entry["name"],
            line=dict(color=entry["color"]),
        )
    )
# Logos overlaid on the plot: (file, x, y, size), with positions and sizes in
# paper coordinates.
logo_placements = [
    ("../figures/logo/ucloud.png", 0.8, 0.2, 0.2),
    ("../figures/aws-white.png", 0.17, 0.7, 0.15),
    ("../figures/aliyun.png", 0.43, 0.3, 0.15),
]
for logo_path, logo_x, logo_y, logo_size in logo_placements:
    # The context manager closes the image file once plotly has consumed it.
    with Image.open(logo_path) as logo_image:
        fig.add_layout_image(
            x=logo_x,
            sizex=logo_size,
            y=logo_y,
            sizey=logo_size,
            xref="paper",
            yref="paper",
            opacity=1.0,
            layer="above",
            source=logo_image,
        )
# No legend; fully transparent paper and plot backgrounds.
fig.update_layout(
    showlegend=False,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(255,255,255,0)",
)
# x axis: no tick labels, axis line, zero line or grid.
fig.update_xaxes(
    showticklabels=False,
    showline=False,
    zeroline=False,
    showgrid=False,
    automargin=True,
)
# y axis: only a white "Loss" title and a faint dashed grid.
fig.update_yaxes(
    showticklabels=False,
    zeroline=False,
    showline=False,
    griddash="4px",
    gridcolor="rgba(255,255,255,0.3)",
    title="Loss",
    color="white",
)
# Bare expression: shows the figure when the cell is run interactively.
fig
# %%
def plot_gantt(df):
    """Draw ``df`` as a Gantt-style timeline and return the figure.

    ``df`` needs "Start", "End" and "Task" columns: every row becomes one bar
    from "Start" to "End", placed on the y position of its "Task" and coloured
    by that task.
    """
    gantt = px.timeline(
        data_frame=df, x_start="Start", x_end="End", y="Task", color="Task"
    )
    # Hour:minute tick format, no legend, transparent backgrounds.
    gantt.update_layout(
        xaxis_tickformat="%H:%M",
        showlegend=False,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(255,255,255,0)",
    )
    # x axis: no tick labels, axis line, zero line or grid.
    gantt.update_xaxes(
        showticklabels=False,
        showline=False,
        zeroline=False,
        showgrid=False,
        automargin=True,
    )
    # y axis: large white task labels, no title, faint dashed grid.
    gantt.update_yaxes(
        zeroline=False,
        showline=False,
        griddash="4px",
        gridcolor="rgba(0,0,0,0.3)",
        title="",
        color="white",
        tickfont={"size": 20},
    )
    return gantt
# Synthetic schedule: one-hour slots starting every four hours, each randomly
# assigned "Read" (probability download_prop) or "Transform".
num_rows = 1000
download_prop = 0.65
df = pd.DataFrame(
    {"Start": pd.date_range("1-jan-2021", periods=num_rows, freq="4h")}
)
df["End"] = df["Start"] + pd.Timedelta(hours=1)
df["Task"] = np.random.choice(
    ["Read", "Transform"], num_rows, p=(download_prop, 1 - download_prop)
)
# Pin both ends so the schedule always starts with "Read" and ends with
# "Transform".
df.loc[0, "Task"] = "Read"
df.loc[len(df) - 1, "Task"] = "Transform"
# Merge each run of consecutive rows with the same task into a single row that
# spans from the run's earliest Start to its latest End.
run_id = df["Task"].ne(df["Task"].shift()).cumsum()
df = df.groupby(run_id).agg({"Start": "min", "End": "max", "Task": "first"})
timeline = df.copy()
# %%
# "profile-naive": cut the whole schedule into 9 equal intervals (10 boundary
# timestamps) and turn every third interval into a "Train" window.
df = timeline.copy()
ddi = pd.date_range(df["Start"].iloc[0], end=df["End"].iloc[-1], periods=10)
for window_start, window_end in zip(ddi[2:-1:3], ddi[3::3]):
    # Relabel the rows that start inside the window ...
    starts_in_window = df["Start"].between(window_start, window_end)
    df.loc[starts_in_window, "Task"] = "Train"
    # ... and append one bar covering the whole window.
    df.loc[len(df) + 1] = pd.Series(
        {"Start": window_start, "End": window_end, "Task": "Train"}
    )
FIGURES["profile-naive"] = plot_gantt(df)
# Bare expression: shows the figure when the cell is run interactively.
FIGURES["profile-naive"]
# %%
# "profile-old": ten "Train" windows; boundaries come from a grid of
# (prop + 1) * 10 timestamps, each window running from grid point 1 to grid
# point prop within its group of prop + 1 points.
df = timeline.copy()
prop = 10
ddi = pd.date_range(
    df["Start"].iloc[0], end=df["End"].iloc[-1], periods=(prop + 1) * 10
)
for window_start, window_end in zip(ddi[1 : -1 : prop + 1], ddi[prop :: prop + 1]):
    # Relabel the rows that start inside the window ...
    starts_in_window = df["Start"].between(window_start, window_end)
    df.loc[starts_in_window, "Task"] = "Train"
    # ... and append one bar covering the whole window.
    df.loc[len(df) + 1] = pd.Series(
        {"Start": window_start, "End": window_end, "Task": "Train"}
    )
FIGURES["profile-old"] = plot_gantt(df)
# Bare expression: shows the figure when the cell is run interactively.
FIGURES["profile-old"]
# %%
# "profile-stream": keep every Read/Transform row and add a single "Train" bar
# running from the first row's Start to the last row's Start.
df = timeline.copy()
train_bar = {
    "Start": df["Start"].iloc[0],
    "End": df["Start"].iloc[-1],
    "Task": "Train",
}
df.loc[len(df) + 1] = pd.Series(train_bar)
FIGURES["profile-stream"] = plot_gantt(df)
# Bare expression: shows the figure when the cell is run interactively.
FIGURES["profile-stream"]
# %%
# Write every registered figure to ../components/<name>.qmd as an HTML
# fragment (not a full page) that loads plotly.js from the CDN.
for name, figure in FIGURES.items():
    print(name)
    target = f"../components/{name}.qmd"
    figure.write_html(target, full_html=False, include_plotlyjs="cdn")
# for i in range(100):
# print(i)
# %%
import qrcode
from qrcode.image.styledpil import StyledPilImage
from qrcode.image.styles.moduledrawers.pil import RoundedModuleDrawer
from qrcode.image.styles.colormasks import RadialGradiantColorMask
# QR code for the u.wechat.com link: error-correction level L, white modules
# on a transparent background. The StyledPilImage-based variants imported in
# this cell are not used.
qr = qrcode.QRCode(error_correction=qrcode.constants.ERROR_CORRECT_L)
qr.add_data("https://u.wechat.com/MAmdMGMYjGFC4-2ESxZ1oyw")
qr_colors = {"fill_color": "white", "back_color": "transparent"}
img_2 = qr.make_image(**qr_colors)
img_2.save("../figures/qr/jing.png")
# %%
# QR code for the mailto: link: error-correction level L, white modules on a
# transparent background. The StyledPilImage-based variants are not used.
qr = qrcode.QRCode(error_correction=qrcode.constants.ERROR_CORRECT_L)
qr.add_data("mailto:data@sustech.edu.cn?subject=Hello&body=")
qr_colors = {"fill_color": "white", "back_color": "transparent"}
img_2 = qr.make_image(**qr_colors)
img_2.save("../figures/qr/mail-data.png")