import os
from pathlib import Path
from threading import Thread

import torch
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr

from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS

# Define model configuration
model_language = "en"  # Example: set to English
model_id = "qwen2.5-0.5b-instruct"  # Example model ID

# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"


def check_and_convert_model():
    """Ensure INT4 weights exist; if not, attempt conversion (locally before deployment)."""
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()


def convert_to_int4():
    """
    Convert the model to INT4 precision using the `optimum-cli` tool.
    This function should only be run locally or in an environment that supports shell commands.
    """
    # Define compression parameters
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Check if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return  # Exit if the model is already converted

    # Build the model compression command for `optimum-cli`
    export_command_base = (
        f"optimum-cli export openvino --model {pt_model_id} "
        "--task text-generation-with-past --weight-format int4"
    )
    int4_compression_args = (
        f" --group-size {model_compression_params['group_size']}"
        f" --ratio {model_compression_params['ratio']}"
    )
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    # You can add other custom compression arguments here (like AWQ)
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (this is typically done locally, not in Hugging Face Spaces)
    # For deployment, the model needs to be pre-compressed and uploaded
    os.system(export_command)


# Check if the INT4 model exists or needs conversion
check_and_convert_model()

# Load the tokenizer from the exported model directory
tok = AutoTokenizer.from_pretrained(str(int4_model_dir), trust_remote_code=True)

# Initialize OpenVINO model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)


def convert_history_to_token(history):
    """
    Convert the history of the conversation into tokens for the model.
    """
    input_ids = tok.encode(history[-1][0])  # Example tokenization: last user message only
    return torch.LongTensor([input_ids])


def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic to process conversation history and stream generated responses.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream and update history with the generated response
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history


# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)