import gradio as gr
from threading import Thread
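# The wildcard import below appears to be a side-effect import: open_lm.hf registers
# the OpenLM/DCLM architecture with transformers' Auto* classes so the DCLM
# checkpoints used here can be loaded through AutoModelForCausalLM.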
from open_lm.hf import *
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from gradio.layouts import Accordion
# Define model options
MODEL_OPTIONS = {
    "TRI-ML/DCLM-1B": "TRI-ML/DCLM-1B",
    "Apple DCLM-Baseline-7B": "apple/DCLM-Baseline-7B"
}
# Global variables for model and tokenizer
current_model = None
current_tokenizer = None
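
# Download the selected checkpoint into the module-level globals above and move the
# model to GPU when one is available, so `generate` can reuse it across requests.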
def load_model(model_name):
    global current_model, current_tokenizer
    current_tokenizer = AutoTokenizer.from_pretrained(MODEL_OPTIONS[model_name])
    current_model = AutoModelForCausalLM.from_pretrained(MODEL_OPTIONS[model_name])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    current_model = current_model.to(device)
    return f"Loaded model: {model_name}"
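
# Stream a completion for `prompt`: model.generate runs in a background thread while
# TextIteratorStreamer yields decoded text chunks, which are appended to the output
# and re-yielded so the Gradio UI updates incrementally. The prompt itself is echoed
# back inside a blue <span> so it stays visually distinct from the completion.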
def generate(
    prompt, model_choice, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    global current_model, current_tokenizer
    if current_model is None or current_tokenizer is None:
        return "Please load a model first."

    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    inputs = current_tokenizer(prompt, return_tensors="pt").to(current_model.device)
    generate_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        pad_token_id=current_tokenizer.eos_token_id
    )

    streamer = TextIteratorStreamer(current_tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs["streamer"] = streamer
    thread = Thread(target=current_model.generate, kwargs=generate_kwargs)
    thread.start()

    # Write the prompt in blue
    output = "<span style='color: blue;'>" + prompt + "</span>"
    for new_text in streamer:
        if isinstance(new_text, torch.Tensor):
            new_text = current_tokenizer.decode(new_text)
        output += new_text
        yield output

    thread.join()
    return output
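
# Sampling controls shown under "Advanced Options". They are created here, outside any
# Blocks context, rendered later inside the accordion, and passed positionally to
# `generate` after the prompt and model choice.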
additional_inputs = [
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=1048,
        step=64,
        interactive=True,
        info="The maximum number of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]
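
# UI layout: model picker with a status box, prompt input, an HTML output for the
# streamed completion, and a collapsed accordion that renders the sliders defined above.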
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # DCLM Text Completion Demo
        This demo allows you to generate text using a DCLM model.
        These models are trained to predict the next word in a sequence of text and can be used to generate text completions; they are not chatbots.
        First select a model from the dropdown and click "Load Model".
        Then enter some text in the text box and click "Generate" to see the model's completion.
        """
    )
    with gr.Row():
        model_dropdown = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), label="Select Model")
        model_dropdown.select(
            load_model,
            inputs=[model_dropdown],
            outputs=[gr.Textbox(label="Model Status")]
        )
    text_input = gr.Textbox(lines=3, label="Input Text")
    text_output = gr.HTML(label="Generated Text")
    generate_button = gr.Button("Generate")
    generate_button.click(
        generate,
        inputs=[text_input, model_dropdown, *additional_inputs],
        outputs=[text_output]
    )
    with Accordion(label="Advanced Options", open=False):
        for input_component in additional_inputs:
            if not input_component.is_rendered:
                input_component.render()
demo.launch()
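
# To try the demo, run this file with Python (gradio, torch, transformers, and the
# open_lm package providing open_lm.hf are assumed to be installed); launch() serves
# the app on a local URL printed to the console.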