import os
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
from pathlib import Path
# Define model configuration
model_language = "en"  # Example: set to English
model_id = "qwen2.5-0.5b-instruct"  # Example model ID

# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"
# Ensure INT4 weights exist; if not, attempt conversion (locally, before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()
def convert_to_int4():
    """
    Convert the model to INT4 precision using the `optimum-cli` tool.
    This function should only be run locally or in an environment that supports shell commands.
    """
    # Define compression parameters
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Check if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return  # Exit if the model is already converted

    # Build the compression command for `optimum-cli` (the output directory is passed as the last argument)
    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    # You can add other custom compression arguments here (e.g. AWQ); see the sketch after this function
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (this is typically done locally, not in Hugging Face Spaces);
    # for deployment, the model needs to be pre-compressed and uploaded
    os.system(export_command)
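
# Optional data-aware compression: a minimal sketch, assuming a recent optimum-intel
# release. The `--awq` and `--dataset` flag names are assumptions here; verify them
# with `optimum-cli export openvino --help` for your installed version.
def build_awq_args(dataset: str = "wikitext2") -> str:
    """Return extra optimum-cli arguments enabling AWQ weight compression (sketch)."""
    return f" --awq --dataset {dataset}"
# Usage (inside convert_to_int4, before running the command):
#   export_command = export_command_base + int4_compression_args + build_awq_args() + f" {int4_model_dir}"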
# Check if the INT4 model exists or needs conversion
check_and_convert_model()

# Load the tokenizer from the compressed model directory (the export saves it alongside the weights)
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
# Initialize the OpenVINO model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)
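
# Runtime tuning: a minimal sketch, assuming your optimum-intel version accepts an
# `ov_config` dictionary in `from_pretrained`. The property names below are standard
# OpenVINO options, but treat the exact values as assumptions and adjust for your setup.
ov_config = {
    "PERFORMANCE_HINT": "LATENCY",  # optimize for single-stream chat latency
    "CACHE_DIR": "model_cache",     # cache compiled blobs to speed up restarts
}
# Usage: pass `ov_config=ov_config` to OVModelForCausalLM.from_pretrained(...) above.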
def convert_history_to_token(history):
    """
    Convert the conversation history into input tokens for the model.
    """
    input_ids = tok.encode(history[-1][0])  # Example: tokenize only the latest user message
    return torch.LongTensor([input_ids])
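
# A fuller alternative: a minimal sketch that builds the prompt from the whole
# conversation via the tokenizer's chat template (Qwen2.5 instruct tokenizers ship one).
# The helper name and the user/assistant message layout are assumptions for illustration.
def convert_full_history_to_token(history):
    """Tokenize the entire conversation using the tokenizer's chat template (sketch)."""
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")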
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic to process conversation history and generate responses.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream the generated response and update the last history entry incrementally
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
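
# Optional stop support: a minimal sketch of a cancel callback that could be passed as
# `stop_fn` below. It assumes the wrapped OpenVINO infer request is exposed as
# `ov_model.request` and supports `cancel()`; that attribute is an assumption about
# optimum-intel internals, so verify it in your installed version.
def request_cancel():
    ov_model.request.cancel()  # abort the in-flight generation request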
# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)