import os
from pathlib import Path
from threading import Event, Thread
from typing import List, Tuple

import torch
import gradio as gr
import requests
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM

import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams

from gradio_helper import make_demo  # Your helper function for Gradio demo
from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
from notebook_utils import device_widget  # Device selection utility


# Define the model conversion function (same as in your notebook)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Skip the export if INT4 weights have already been produced
    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir

    remote_code = model_configuration.get("remote_code", False)
    export_command_base = (
        f"optimum-cli export openvino --model {model_configuration['model_id']} "
        "--task text-generation-with-past --weight-format int4"
    )
    int4_compression_args = (
        f" --group-size {model_compression_params['group_size']}"
        f" --ratio {model_compression_params['ratio']}"
    )
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + f" {str(int4_model_dir)}"

    # Execute export command (shell command)
    os.system(export_command)
    return int4_model_dir


# Model and tokenizer loading
def load_model(model_dir, device):
    ov_config = {
        hints.performance_mode(): hints.PerformanceMode.LATENCY,
        streams.num(): "1",
        props.cache_dir(): "",
    }
    core = ov.Core()
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )
    return ov_model, tok


# Loaded model and tokenizer; set by the "Load Model" button in the UI
ov_model = None
tok = None


# Gradio callback for bot interaction: stream a reply for the latest user turn
def bot(history, temperature, top_p, top_k, repetition_penalty):
    input_ids = convert_history_to_token(history)
    if input_ids.shape[1] > 2000:
        history = [history[-1]]  # Limit input size to the most recent turn
        input_ids = convert_history_to_token(history)

    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    stream_complete = Event()

    # Generate the response in a separate thread so tokens can be streamed
    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        stream_complete.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    # Process partial text and yield the updated history as tokens arrive
    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history
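
# `convert_history_to_token` and `text_processor` are used by bot() above but
# come from the original notebook and are not defined in this script. The
# sketches below are minimal stand-ins (assumptions, not the notebook's exact
# implementations): the first builds the prompt with the tokenizer's chat
# template, the second simply appends each streamed chunk to the running text.
def convert_history_to_token(history: List[Tuple[str, str]]):
    # Flatten the Gradio chat history into chat messages for the template
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # Return a torch tensor of input ids with the generation prompt appended
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")


def text_processor(partial_text: str, new_text: str) -> str:
    # Accumulate streamed text; some models may need extra post-processing here
    return partial_text + new_text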

# Define a Gradio interface for user interaction
def create_gradio_interface():
    model_languages = list(SUPPORTED_LLM_MODELS.keys())  # List of model languages
    initial_model_ids = list(SUPPORTED_LLM_MODELS[model_languages[0]].keys())

    with gr.Blocks(title="OpenVINO Chatbot") as demo:
        # Dropdowns for selecting model language and model ID
        model_language = gr.Dropdown(choices=model_languages, value=model_languages[0], label="Model Language")
        model_id = gr.Dropdown(choices=initial_model_ids, value=initial_model_ids[0], label="Model ID")

        # Once a model language is selected, show the respective model IDs
        def update_model_ids(model_language):
            model_ids = list(SUPPORTED_LLM_MODELS[model_language].keys())
            return gr.update(choices=model_ids, value=model_ids[0])

        model_language.change(update_model_ids, inputs=model_language, outputs=model_id)

        # Checkbox for enabling AWQ during weight compression
        enable_awq = gr.Checkbox(value=False, label="Enable AWQ for Compression")

        # Convert (if needed) and load the model selected by language and ID
        def load_model_on_select(model_language, model_id, enable_awq):
            global ov_model, tok
            model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
            int4_model_dir = convert_to_int4(model_id, model_configuration, enable_awq)

            # Load the model and tokenizer on the selected device
            device = device_widget("CPU").value  # or any device you want to use
            ov_model, tok = load_model(int4_model_dir, device)
            return f"Loaded {model_id} on {device}"

        # Connect the model selection UI to load the model dynamically
        model_status = gr.Textbox(label="Model Status")
        load_button = gr.Button("Load Model")
        load_button.click(
            load_model_on_select,
            inputs=[model_language, model_id, enable_awq],
            outputs=model_status,
        )

        # Chatbot interface and parameters for bot generation
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="Message")
        temperature = gr.Slider(minimum=0, maximum=1, step=0.1, label="Temperature", value=0.7)
        top_p = gr.Slider(minimum=0, maximum=1, step=0.1, label="Top-p", value=0.9)
        top_k = gr.Slider(minimum=0, maximum=50, step=1, label="Top-k", value=50)
        repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.1, label="Repetition Penalty", value=1.0)

        # Append the user message to the history, then stream the bot reply
        def add_user_message(message, history):
            return "", history + [[message, ""]]

        msg.submit(
            add_user_message, inputs=[msg, chatbot], outputs=[msg, chatbot]
        ).then(
            bot,
            inputs=[chatbot, temperature, top_p, top_k, repetition_penalty],
            outputs=chatbot,
        )

    return demo


# Run the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True, share=True)  # share=True for public access