# Hugging Face Spaces page status when this file was captured: Runtime error
import io
import time
from typing import Iterator, Optional, Tuple
from urllib.parse import urlparse, parse_qs

import gradio as gr
import pandas as pd
import spaces  # not referenced in this file; presumably required by the Spaces runtime (TODO confirm)

from generate import stream_jsonl_file
MAX_SIZE = 20  # largest number of samples a single request may ask for
DEFAULT_SEED = 42  # seed used when the query string has no "seed" parameter
DEFAULT_SIZE = 3  # sample count used when the query string has no "size" parameter


def stream_output(filename: str) -> Iterator[Tuple[Optional[pd.DataFrame], Optional[str], str]]:
    """Stream a generated JSON Lines dataset as incremental UI updates.

    ``filename`` is parsed like a URL: its path is the dataset file name and
    its optional query string carries generation options, for example
    ``common_first_names.jsonl?columns=first_name,popularity&size=10``.

    Recognized query parameters (the first occurrence of each is used):
        prompt: free text forwarded to the generator (default ``""``).
        columns: comma-separated column names; blank entries are dropped
            (default: no columns).
        size: number of samples (default ``DEFAULT_SIZE``, at most ``MAX_SIZE``).
        seed: integer seed (default ``DEFAULT_SEED``).

    Yields:
        ``(dataframe, file_content_markdown, state_msg)`` after every chunk
        produced by ``stream_jsonl_file``. When the request is invalid, a
        single ``(None, None, "Error: ...")`` tuple is yielded and nothing is
        generated.
    """
    parsed_filename = urlparse(filename)
    params = parse_qs(parsed_filename.query)

    prompt = params.get("prompt", [""])[0]
    columns = [
        column.strip()
        for column in params.get("columns", [""])[0].split(",")
        if column.strip()
    ]

    try:
        size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
        seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
    except ValueError:
        # Report malformed numbers in the status area instead of crashing.
        yield None, None, "Error: size and seed must be integers"
        return

    if size > MAX_SIZE:
        yield None, None, f"Error: Maximum size is {MAX_SIZE}"
        # Stop here: previously execution fell through and the oversized
        # dataset was generated anyway.
        return

    content = ""
    start_time = time.time()
    chunks = stream_jsonl_file(
        filename=parsed_filename.path,
        prompt=prompt,
        columns=columns,
        seed=seed,
        size=size,
    )
    for i, chunk in enumerate(chunks):
        content += chunk
        # Re-parse everything received so far so the table grows as we stream.
        # NOTE(review): assumes each chunk is one complete JSON line, which is
        # also what the [i + 1/size] progress counter relies on - confirm
        # against stream_jsonl_file.
        df = pd.read_json(io.StringIO(content), lines=True)
        if i + 1 == size:
            elapsed = time.time() - start_time
            state_msg = f"✅ Done generating {size} samples in {elapsed:.2f}s"
        else:
            state_msg = f"⚙️ Generating... [{i + 1}/{size}]"
        yield df, "```json\n" + content + "\n```", state_msg
# Page heading and tagline shown at the top of the demo.
title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"

# Sample inputs offered in the UI. An entry may carry generation options in a
# query string (columns, size, ...). Every entry needs its own trailing comma:
# adjacent string literals are silently concatenated into one bogus example.
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]
# Build the UI: a filename box (pre-filled with the first example), the list
# of examples, a generate button, a status line, and two tabs showing the
# streamed result as a table and as raw file content.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    filename_comp = gr.Textbox(value=examples[0], placeholder=examples[0])
    gr.Examples(examples=examples, inputs=filename_comp)
    generate_button = gr.Button("Generate dataset")
    state_msg_comp = gr.Markdown("🔥 Ready to generate")

    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()

    # stream_output is a generator; each tuple it yields updates, in order,
    # the table, the raw file content and the status line.
    generate_button.click(
        fn=stream_output,
        inputs=filename_comp,
        outputs=[dataframe_comp, file_content_comp, state_msg_comp],
    )

demo.launch()