import os
from pathlib import Path
from threading import Thread

import torch
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr

from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS

# Define model configuration
model_language = "en"  # Example: set to English
model_id = "qwen2.5-0.5b-instruct"  # Example model ID

# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"

# Ensure INT4 weights exist; if not, attempt conversion (locally before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()

def convert_to_int4():
    """
    Convert the model to INT4 precision using the `optimum-cli` tool.

    This function should only be run locally or in an environment that
    supports shell commands.
    """
    # Define compression parameters
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Check if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return  # Exit if the model is already converted

    # Run model compression using `optimum-cli`
    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    # You can add other custom compression arguments here (like AWQ)
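    # For example, data-aware AWQ compression could be requested like this
    # (assumes an optimum-intel release whose CLI exposes the --awq/--dataset
    # flags; check `optimum-cli export openvino --help` before enabling):
    # int4_compression_args += " --awq --dataset wikitext2"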
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")
    # Execute the export command (this is typically done locally, not in Hugging Face Spaces)
    # For deployment, the model needs to be pre-compressed and uploaded
    os.system(export_command)

# Check if the INT4 model exists or needs conversion
check_and_convert_model()

# Load the tokenizer once the INT4 weights are in place
tok = AutoTokenizer.from_pretrained(str(int4_model_dir), trust_remote_code=True)

# Initialize OpenVINO model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)

def convert_history_to_token(history):
    """
    Convert the conversation history into input tokens for the model.
    """
    input_ids = tok.encode(history[-1][0])  # Simple example: tokenize only the latest user message
    return torch.LongTensor([input_ids])
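
# Optional, hypothetical alternative (not wired into the demo above): Qwen2.5
# instruct tokenizers ship a chat template, so the full conversation could be
# encoded with `apply_chat_template` instead of only the latest user message.
def _convert_full_history_to_token(history):
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # Returns a torch.LongTensor of shape (1, seq_len)
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")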

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic to process conversation history and generate responses.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so the streamer can be consumed while tokens are produced
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

    # Stream and update history with the generated response
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history

# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)