# llm-chatbot / app.py
import os
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
from pathlib import Path
# Define model configuration
model_language = "en" # Example: set to English
model_id = "qwen2.5-0.5b-instruct" # Example model ID
# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"

# Ensure INT4 weights exist; if not, attempt conversion (locally, before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()

def convert_to_int4():
    """
    Convert the model to INT4 precision using the `optimum-cli` tool.
    This should only be run locally or in an environment that supports shell commands.
    """
    # Per-model compression parameters
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Skip conversion if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return

    # Build the `optimum-cli` export command; the target directory is passed as the
    # final positional argument
    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    # Other compression arguments (e.g. AWQ) can be appended here
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (typically done locally, not in Hugging Face Spaces;
    # for deployment, the model should be pre-compressed and uploaded)
    os.system(export_command)
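
# For reference, with the parameters above the assembled command looks roughly like the
# following. The Hub id "Qwen/Qwen2.5-0.5B-Instruct" is only an illustrative guess; the
# actual value of pt_model_id comes from SUPPORTED_LLM_MODELS in llm_config.py.
#
#   optimum-cli export openvino --model Qwen/Qwen2.5-0.5B-Instruct \
#       --task text-generation-with-past --weight-format int4 \
#       --group-size 128 --ratio 1.0 --sym qwen2.5-0.5b-instruct/INT4_compressed_weights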

# Check if the INT4 model exists or needs conversion
check_and_convert_model()

# Load the tokenizer from the (now present) INT4 model directory
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

# Initialize OpenVINO and load the compressed model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)
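
# Optional tuning: `from_pretrained` in optimum-intel also accepts an `ov_config` dict of
# OpenVINO runtime properties. A minimal sketch (commented out; property values are
# illustrative, not required by this app):
#
#   ov_model = OVModelForCausalLM.from_pretrained(
#       str(int4_model_dir),
#       device="CPU",
#       ov_config={"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": ""},
#       config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
#       trust_remote_code=True,
#   )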

def convert_history_to_token(history):
    """
    Convert the conversation history into input tokens for the model.

    Note: this simplified version only encodes the latest user message;
    earlier turns are not included in the prompt.
    """
    input_ids = tok.encode(history[-1][0])
    return torch.LongTensor([input_ids])
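
# A fuller variant (illustrative only, not used by bot() below) would run the whole
# history through the tokenizer's chat template, assuming the tokenizer provides one:
def convert_history_to_token_with_template(history):
    """Illustrative multi-turn variant using the tokenizer's chat template."""
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # Returns a [1, seq_len] tensor of input ids ready for generate()
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")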

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic: process the conversation history and stream the generated response.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream partial output and update the last history entry as it grows
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
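
# `bot` is a generator: the Gradio helper iterates it to refresh the chat window.
# Illustrative call (history is a list of [user, assistant] pairs):
#
#   for updated_history in bot([["Hello!", ""]], 0.7, 0.9, 50, 1.1, None):
#       ...  # each iteration carries the partially generated reply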

# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)