import os
from pathlib import Path
from threading import Thread

import torch
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS

# Define model configuration
model_language = "en"  # Example: set to English
model_id = "qwen2.5-0.5b-instruct"  # Example model ID

# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"
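# After a successful export, `int4_model_dir` is expected to contain the OpenVINO IR
# (openvino_model.xml / openvino_model.bin) together with the tokenizer and config files
# that `optimum-cli export openvino` writes alongside it.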

# Ensure INT4 weights exist; if not, attempt conversion (locally before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()

def convert_to_int4():
    """
    Converts a model to INT4 precision using the optimum-cli tool.
    This function should only be run locally or in an environment that supports shell commands.
    """
    # Define compression parameters
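    # `sym` selects symmetric quantization, `group_size` is the number of weights that
    # share one quantization scale, and `ratio` is the fraction of weights compressed
    # to INT4 (the remainder is kept in INT8).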
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }

    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Check if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return  # Exit if the model is already converted

    # Run model compression using `optimum-cli`
    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    
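    # Other custom compression arguments (such as AWQ) can be appended here; a hedged
    # sketch (flag availability depends on your optimum-intel version, and the
    # calibration dataset choice is an assumption) would be:
    #   int4_compression_args += " --awq --dataset wikitext2"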
    # Build the final export command; the output directory is a required positional
    # argument of `optimum-cli export openvino`
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (this is typically done locally, not in Hugging Face Spaces)
    # For deployment, the model needs to be pre-compressed and uploaded
    os.system(export_command)

# Make sure the INT4 model exists (converting it if needed) before loading from its directory
check_and_convert_model()

# Load tokenizer from the exported model directory
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

# Initialize OpenVINO model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)
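# Note: an OpenVINO runtime configuration can also be passed at load time. A minimal,
# optional sketch (the property values are assumptions; tune them for your hardware):
#   ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
#   ov_model = OVModelForCausalLM.from_pretrained(str(int4_model_dir), ov_config=ov_config, ...)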

def convert_history_to_token(history):
    """
    Convert the conversation history into input token IDs using the tokenizer's chat template.
    `history` is assumed to follow Gradio's chatbot convention: a list of
    [user_message, assistant_message] pairs, with the latest assistant slot still empty.
    """
    messages = []
    for user_text, model_text in history:
        messages.append({"role": "user", "content": user_text})
        if model_text:
            messages.append({"role": "assistant", "content": model_text})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic: tokenize the conversation history and stream back a generated response.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be consumed from the streamer
    # as they are produced instead of waiting for generation to finish
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

    # Stream partial output back to the UI, updating the last history entry in place
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history

# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)
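# Note: `share=True` creates a temporary public gradio.live link, which is mainly useful
# for local runs; on Hugging Face Spaces the app is already served publicly without it.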