omwdataset

Runtime error

File size: 2,713 Bytes

a4dc57a

import os
from plotly import graph_objects as go
import pandas as pd

## Evaluation Graphs

# Load the data
# Every "*.csv" file in data/txt360_eval holds the results for one metric.
# The result is a nested mapping:
#   all_eval_results[metric_name][dataset_name] -> pandas Series of floats
#   all_eval_results[metric_name]["token"]      -> list of x-axis positions
all_eval_results = {}
_EVAL_DIR = "data/txt360_eval"
# Dataset name -> positional column index inside each metric CSV.
_DATASET_COLUMNS = {
    "fineweb": 1,
    "txt360-dedup-only": 2,
    "txt360-web-only-upsampled": 3,
    "txt360-all-upsampled + stackv2": 4,
}
# os.listdir() returns entries in arbitrary order; sort so the metrics (and
# therefore the figures built from them) come out in a stable order.
for fname in sorted(os.listdir(_EVAL_DIR)):
    if not fname.endswith(".csv"):
        continue
    metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")
    df = pd.read_csv(os.path.join(_EVAL_DIR, fname))

    # The first two data rows are skipped (presumably header/metadata rows --
    # confirm against the CSVs). Missing values are back-filled from the next
    # available row. Series.bfill() is used instead of
    # fillna(method="bfill"), which is deprecated in recent pandas releases.
    metric_results = {
        dataset: df.iloc[2:, column].astype(float).bfill()
        for dataset, column in _DATASET_COLUMNS.items()
    }

    # each row is 20B tokens.
    metric_results["token"] = [
        20 * i for i in range(len(metric_results["fineweb"]))
    ]
    all_eval_results[metric_name] = metric_results


# Eval Result Plots
# Build one plotly line chart per metric and store it under the metric name.
# Each chart has one line per dataset, all sharing the "token" x-axis values.
all_eval_res_figs = {}
# (key in the per-metric results, legend label), listed in drawing order.
_TRACE_SPECS = (
    ("fineweb", 'FineWeb'),
    ("txt360-web-only-upsampled", 'TxT360 - CC Data Upsampled'),
    ("txt360-dedup-only", 'TxT360 - CC Data Dedup'),
    ("txt360-all-upsampled + stackv2", 'TxT360 - Full Upsampled + Stack V2'),
)
for metric_name, res in all_eval_results.items():
    token_axis = res["token"]
    fig_res = go.Figure()

    # One line per dataset
    for dataset_key, legend_label in _TRACE_SPECS:
        line = go.Scatter(
            x=token_axis,
            y=res[dataset_key],
            mode='lines',
            name=legend_label,
        )
        fig_res.add_trace(line)

    # Titles and axis labels; title_x=0.5 centers the title
    layout_options = dict(
        title=f"{metric_name} Performance",
        title_x=0.5,
        xaxis_title="Billion Tokens",
        yaxis_title=metric_name,
        legend_title="Dataset",
    )
    fig_res.update_layout(**layout_options)

    all_eval_res_figs[metric_name] = fig_res