Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,884 Bytes
fbe940a 6b97460 4f83ec0 6b97460 6447366 6b97460 6447366 4f83ec0 fbe940a 6b97460 fbe940a 6b97460 4f83ec0 6b97460 fbe940a 4f83ec0 fbe940a 4f83ec0 fbe940a 6b97460 4f83ec0 fbe940a 4f83ec0 6b97460 fbe940a 4f83ec0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
from pathlib import Path
from urllib.parse import urlparse, parse_qs
import gradio as gr
import io
import pandas as pd
import spaces
from generate import stream_jsonl_file
# Upper bound on the `size` query parameter; stream_output raises gr.Error above it.
MAX_SIZE = 20
# Seed used by stream_output when the query carries no `seed` parameter.
DEFAULT_SEED = 42
# Batch size used by stream_output when the query carries no `size` parameter.
DEFAULT_SIZE = 3
@spaces.GPU(duration=120)
def stream_output(query: str, continue_content: str = ""):
    """Generate one batch of JSON Lines rows and stream UI updates as it grows.

    ``query`` is a file name optionally followed by URL-style parameters, for
    example ``"names.jsonl?columns=first_name,popularity&size=10"``. The
    recognised parameters are ``prompt``, ``columns`` (comma-separated),
    ``size`` and ``seed``.

    Args:
        query: File name plus optional ``?key=value&...`` parameters. Only the
            final path component is kept, and that same string is the name of
            the file the accumulated content is written to.
        continue_content: JSON Lines text produced so far. Newly generated
            chunks are appended to it; the empty default starts from scratch.

    Yields:
        5-tuples ``(dataframe, markdown, generate_button, more_button,
        download_button)``: one before generation starts, one per generated
        chunk, and a final one (after the file has been written) with the
        "more" and download buttons enabled.

    Raises:
        gr.Error: If the file name is empty, ``size``/``seed`` are not
            integers, ``size`` is below 1, or ``size`` exceeds ``MAX_SIZE``.
    """
    # Drop any directory components; the remaining name is reused below as the
    # relative path of the output file.
    query = Path(query).name
    if not query:
        # Without this guard the whole batch is generated before open("") fails.
        raise gr.Error("Please provide a file name to generate.")

    parsed_filename = urlparse(query)
    filename = parsed_filename.path
    params = parse_qs(parsed_filename.query)
    prompt = params["prompt"][0] if "prompt" in params else ""
    columns = (
        [column.strip() for column in params["columns"][0].split(",") if column.strip()]
        if "columns" in params
        else []
    )
    try:
        size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
        seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
    except ValueError as err:
        raise gr.Error("The `size` and `seed` parameters must be integers.") from err
    if size < 1:
        # size == 0 would otherwise divide by zero when deriving the batch seed.
        raise gr.Error("Size must be at least 1.")
    if size > MAX_SIZE:
        raise gr.Error(f"Maximum size is {MAX_SIZE}. Duplicate this Space to remove this limit.")

    def as_markdown(text: str) -> str:
        # Wrap the raw JSON Lines text in a fenced block for the Markdown tab.
        return "```json\n" + text + "\n```"

    def busy_controls(message: str):
        # While generating: progress text on the main button, other buttons disabled.
        return (
            gr.Button(message),
            gr.Button("Generate one more batch", interactive=False),
            gr.DownloadButton("⬇️ Download", interactive=False),
        )

    content = continue_content
    df = pd.read_json(io.StringIO(content), lines=True)
    continue_content_size = len(df)
    total = continue_content_size + size
    if list(df.columns):
        # Existing rows take precedence over the `columns` query parameter.
        columns = list(df.columns)
    else:
        # Nothing parsed yet: show an empty placeholder table.
        df = pd.DataFrame({"1": [], "2": [], "3": []})
    state_msg = f"⚙️ Generating... [{continue_content_size + 1}/{total}]"
    yield (df, as_markdown(content), *busy_controls(state_msg))

    chunks = stream_jsonl_file(
        filename=filename,
        prompt=prompt,
        columns=columns,
        # Offset the seed by the number of full batches already present.
        seed=seed + (continue_content_size // size),
        size=size,
    )
    for i, chunk in enumerate(chunks):
        content += chunk
        # NOTE(review): re-parsing assumes each chunk leaves `content` as valid
        # JSON Lines — confirm against stream_jsonl_file.
        df = pd.read_json(io.StringIO(content), lines=True)
        state_msg = f"⚙️ Generating... [{continue_content_size + i + 1}/{total}]"
        yield (df, as_markdown(content), *busy_controls(state_msg))

    # Persist the result so it can be downloaded and later continued.
    with open(query, "w", encoding="utf-8") as f:
        f.write(content)
    yield (
        df,
        as_markdown(content),
        gr.Button("Generate dataset"),
        gr.Button("Generate one more batch", visible=True, interactive=True),
        gr.DownloadButton("⬇️ Download", value=query, visible=True, interactive=True),
    )
def stream_more_output(query: str):
    """Continue a dataset by generating one more batch on top of the saved file.

    The file named by the final path component of ``query`` is read in full and
    passed to ``stream_output`` as ``continue_content``; every update produced
    by ``stream_output`` is re-yielded unchanged.
    """
    saved_name = Path(query).name
    with open(saved_name, encoding="utf-8") as saved_file:
        previous_rows = saved_file.read()
    yield from stream_output(query=saved_name, continue_content=previous_rows)
# Page header texts.
title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
# Sample queries; the first one is also the textbox's initial value and placeholder.
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0], label="File name to generate")
    generate_button = gr.Button("Generate dataset")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()
    with gr.Row():
        # Both buttons start hidden; stream_output's final update reveals them.
        generate_more_button = gr.Button("Generate one more batch", visible=False, interactive=False, scale=3)
        download_button = gr.DownloadButton("⬇️ Download", visible=False, interactive=False, scale=1)
    # Order matches the 5-tuples yielded by stream_output.
    outputs = [dataframe_comp, file_content_comp, generate_button, generate_more_button, download_button]
    # NOTE: this rebinds `examples` from the list of strings to the gr.Examples component.
    examples = gr.Examples(examples, filename_comp, outputs, fn=stream_output, run_on_click=True)
    generate_button.click(stream_output, filename_comp, outputs)
    generate_more_button.click(stream_more_output, filename_comp, outputs)

demo.launch()
|