File size: 3,884 Bytes
fbe940a
6b97460
4f83ec0
 
 
 
 
 
6b97460
6447366
6b97460
 
 
6447366
4f83ec0
fbe940a
 
 
6b97460
 
 
 
 
 
 
fbe940a
 
 
 
 
 
 
 
 
 
6b97460
4f83ec0
6b97460
 
fbe940a
4f83ec0
 
 
 
fbe940a
 
 
 
 
 
 
 
 
 
 
 
 
4f83ec0
 
 
 
 
fbe940a
6b97460
 
4f83ec0
 
 
 
 
fbe940a
 
4f83ec0
 
 
 
6b97460
fbe940a
 
 
 
 
 
 
4f83ec0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from pathlib import Path
from urllib.parse import urlparse, parse_qs

import gradio as gr
import io
import pandas as pd
import spaces

from generate import stream_jsonl_file

# Upper bound on the `size` query parameter; stream_output rejects larger values.
MAX_SIZE = 20
# Seed used when the query string does not provide `seed`.
DEFAULT_SEED = 42
# Value of `size` used when the query string does not provide it.
DEFAULT_SIZE = 3

@spaces.GPU(duration=120)
def stream_output(query: str, continue_content: str = ""):
    """Generate a JSON Lines dataset for ``query`` and stream UI updates.

    ``query`` is a file name optionally followed by a URL-style query string,
    e.g. ``"names.jsonl?columns=first_name,popularity&size=10"``. Recognised
    parameters are ``prompt``, ``columns`` (comma-separated), ``size`` and
    ``seed``. Only the final path component of ``query`` is used.

    Args:
        query: File name (plus optional query string) describing the dataset.
        continue_content: JSON Lines text generated so far; newly generated
            chunks are appended to it. Empty when starting from scratch.

    Yields:
        A 5-tuple: the dataframe parsed from the content so far, the content
        wrapped in a Markdown ``json`` code fence, and updates for the
        generate button, the "one more batch" button and the download button.

    Raises:
        gr.Error: If ``size`` or ``seed`` is not an integer, if ``size`` is
            smaller than 1, or if ``size`` exceeds ``MAX_SIZE``.
    """
    # Keep only the last path component so the name cannot point outside the
    # working directory when it is later used as an output file path.
    query = Path(query).name
    parsed_filename = urlparse(query)
    filename = parsed_filename.path
    params = parse_qs(parsed_filename.query)
    prompt = params["prompt"][0] if "prompt" in params else ""
    columns = (
        [column.strip() for column in params["columns"][0].split(",") if column.strip()]
        if "columns" in params
        else []
    )
    try:
        size = int(params["size"][0]) if "size" in params else DEFAULT_SIZE
        seed = int(params["seed"][0]) if "seed" in params else DEFAULT_SEED
    except ValueError as err:
        # Surface malformed numbers as a user-facing error instead of a raw traceback.
        raise gr.Error("`size` and `seed` must be integers.") from err
    if size < 1:
        # size == 0 would otherwise fail below with ZeroDivisionError.
        raise gr.Error("Minimum size is 1.")
    if size > MAX_SIZE:
        raise gr.Error(f"Maximum size is {MAX_SIZE}. Duplicate this Space to remove this limit.")

    content = continue_content
    df = pd.read_json(io.StringIO(content), lines=True)
    continue_content_size = len(df)
    state_msg = f"⚙️ Generating... [{continue_content_size + 1}/{continue_content_size + size}]"
    if list(df.columns):
        # When continuing, the columns of the existing content take precedence
        # over the ones requested in the query string.
        columns = list(df.columns)
    else:
        # Placeholder columns so the table is not rendered without headers.
        df = pd.DataFrame({"1": [], "2": [], "3": []})
    yield df, "```json\n" + content + "\n```", gr.Button(state_msg), gr.Button("Generate one more batch", interactive=False), gr.DownloadButton("⬇️ Download", interactive=False)

    for i, chunk in enumerate(stream_jsonl_file(
        filename=filename,
        prompt=prompt,
        columns=columns,
        # Offset the seed by the number of batches already present.
        seed=seed + (continue_content_size // size),
        size=size,
    )):
        content += chunk
        df = pd.read_json(io.StringIO(content), lines=True)
        state_msg = f"⚙️ Generating... [{continue_content_size + i + 1}/{continue_content_size + size}]"
        yield df, "```json\n" + content + "\n```", gr.Button(state_msg), gr.Button("Generate one more batch", interactive=False), gr.DownloadButton("⬇️ Download", interactive=False)

    # Persist the result under the sanitized query name: the download button
    # serves this file and stream_more_output reads it back to continue.
    with open(query, "w", encoding="utf-8") as f:
        f.write(content)
    yield df, "```json\n" + content + "\n```", gr.Button("Generate dataset"), gr.Button("Generate one more batch", visible=True, interactive=True), gr.DownloadButton("⬇️ Download", value=query, visible=True, interactive=True)


def stream_more_output(query: str):
    """Continue a previously generated dataset with one more batch.

    Reads the content saved under the final path component of ``query`` and
    delegates to ``stream_output``, passing that content as the starting
    point, re-yielding everything it produces.
    """
    saved_name = Path(query).name
    with open(saved_name, mode="r", encoding="utf-8") as saved_file:
        previous_content = saved_file.read()
    yield from stream_output(query=saved_name, continue_content=previous_content)


# Heading and subtitle rendered at the top of the page.
title = "LLM DataGen"
description = "Generate and stream synthetic dataset files in JSON Lines format"
# Example queries offered in the UI; the first one is also the default value
# and placeholder of the file-name textbox. A query may carry URL-style
# parameters (e.g. `columns`, `size`) after the file name.
examples = [
    "movies_data.jsonl",
    "dungeon_and_dragon_characters.jsonl",
    "bad_amazon_reviews_on_defunct_products_that_people_hate.jsonl",
    "common_first_names.jsonl?columns=first_name,popularity&size=10",
]

# Build the Gradio interface: a file-name textbox, a generate button, two tabs
# showing the result (table and raw file content), and a row with the
# "one more batch" and download buttons, which start hidden.
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    filename_comp = gr.Textbox(examples[0], placeholder=examples[0], label="File name to generate")
    generate_button = gr.Button("Generate dataset")
    with gr.Tab("Dataset"):
        dataframe_comp = gr.DataFrame()
    with gr.Tab("File content"):
        file_content_comp = gr.Markdown()
    with gr.Row():
        generate_more_button = gr.Button("Generate one more batch", visible=False, interactive=False, scale=3)
        download_button = gr.DownloadButton("⬇️ Download", visible=False, interactive=False, scale=1)
    # Order must match the 5-tuples yielded by stream_output.
    outputs = [dataframe_comp, file_content_comp, generate_button, generate_more_button, download_button]
    # NOTE: this rebinds `examples` from the list of strings to the Examples component.
    examples = gr.Examples(examples, filename_comp, outputs, fn=stream_output, run_on_click=True)
    generate_button.click(stream_output, filename_comp, outputs)
    generate_more_button.click(stream_more_output, filename_comp, outputs)


demo.launch()