# Import necessary libraries
from threading import Thread
import argparse

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer, AutoModelForCausalLM
from peft import PeftConfig, PeftModel

from utils import get_device  # Assuming this function already exists

# Create the parser
parser = argparse.ArgumentParser(description='Check model usage.')

# Add the arguments
parser.add_argument('--baseonly', action='store_true',
                    help='A boolean switch to indicate base only mode')

# Execute the parse_args() method
args = parser.parse_args()

# Define model and adapter names and the torch data type
model_name = "microsoft/Phi-3-mini-4k-instruct"
adapters_name = "zurd46/eliAI"
torch_dtype = torch.bfloat16  # Set the appropriate torch data type

# Display device and CPU thread information
device = get_device()
print("Running on device:", device)
print("CPU threads:", torch.get_num_threads())

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
model.resize_token_embeddings(len(tokenizer))

# Load adapter if available and not baseonly
usingAdapter = False
if not args.baseonly:
    usingAdapter = True
    model = PeftModel.from_pretrained(model, adapters_name)

model.to(device)
print(f"Model {model_name} loaded successfully on {device}")


# Function to run the text generation process
def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # Wrap the user input in the prompt template only when the adapter is active
    template = "\n{}\n"
    model_inputs = tokenizer(template.format(user_text) if usingAdapter else user_text,
                             return_tensors="pt")
    model_inputs = model_inputs.to(device)

    # Generate text in a separate thread so the streamer can be consumed here
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=model_inputs['input_ids'],
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Collect the streamed tokens and return the full response
    model_output = ""
    for new_text in streamer:
        model_output += new_text
    return model_output


# Gradio UI setup
with gr.Blocks(css="""
    div.svelte-sfqy0y { display: flex; flex-direction: inherit; flex-wrap: wrap; gap: var(--form-gap-width); box-shadow: var(--block-shadow); border: var(--block-border-width) solid var(--border-color-primary); border-radius: var(--block-radius); background: var(--block-background-fill); overflow-y: hidden; padding: 20px; }
    body { font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; background-color: var(--body-background-fill); color: #e0e0e0; margin: 0; padding: 0; box-sizing: border-box; }
    .gradio-container { max-width: 900px; margin: auto; padding: 20px; border-radius: 8px; box-shadow: 0 0 10px rgba(0,0,0,0.5); background: var(--body-background-fill); }
    .gr-button { background-color: var(--block-background-fill); color: white; border: none; border-radius: 4px; padding: 10px 24px; cursor: pointer; }
    .gr-button:hover { background-color: #3700b3; }
    .gr-slider input[type=range] { -webkit-appearance: none; width: 100%; height: 8px; border-radius: 5px; background: #333; outline: none; opacity: 0.9; -webkit-transition: .2s; transition: opacity .2s; }
    .gr-slider input[type=range]:hover { opacity: 1; }
    .gr-textbox { background-color: var(--block-background-fill); color: white; border: none; border-radius: 4px; padding: 10px; }
    .chatbox { max-height: 400px; overflow-y: auto; margin-bottom: 20px; }
""") as demo:
    gr.Markdown(
        """

🌙 eliAI Text Generation Interface

Model: Phi-3-mini-4k-instruct

Developed by Daniel Zurmühle

""") with gr.Row(): with gr.Column(scale=3): user_text = gr.Textbox(placeholder="Enter your question here", label="User Input", lines=3, elem_classes="gr-textbox") button_submit = gr.Button(value="Submit", elem_classes="gr-button") max_new_tokens = gr.Slider(minimum=1, maximum=1000, value=1000, step=1, label="Max New Tokens") top_p = gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)") top_k = gr.Slider(minimum=1, maximum=50, value=50, step=1, label="Top-k") temperature = gr.Slider(minimum=0.1, maximum=5.0, value=0.8, step=0.1, label="Temperature") with gr.Column(scale=7): model_output = gr.Chatbot(label="Chatbot Output", height=566) def handle_submit(text, top_p, temperature, top_k, max_new_tokens): response = run_generation(text, top_p, temperature, top_k, max_new_tokens) return [(text, response)] button_submit.click(handle_submit, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) user_text.submit(handle_submit, [user_text, top_p, temperature, top_k, max_new_tokens], model_output) demo.queue(max_size=32).launch(server_name="0.0.0.0", server_port=7860)