import os
from pathlib import Path
from threading import Thread

import torch
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS

# Define model configuration
model_language = "en"  # Example: set to English
model_id = "qwen2.5-0.5b-instruct"  # Example model ID

# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"
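# After a successful export, `int4_model_dir` is expected to contain the OpenVINO IR
# (openvino_model.xml / openvino_model.bin) together with the tokenizer and config files
# that `optimum-cli export openvino` writes alongside it.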

# Ensure INT4 weights exist; if not, attempt conversion (locally before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()

def convert_to_int4():
    """
    Converts a model to INT4 precision using the optimum-cli tool.
    This function should only be run locally or in an environment that supports shell commands.
    """
    # Define compression parameters
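    # `sym` selects symmetric quantization, `group_size` is the number of weights that
    # share one quantization scale, and `ratio` is the fraction of weights compressed
    # to INT4 (the remainder is kept in INT8).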
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }

    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Check if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return  # Exit if the model is already converted

    # Run model compression using `optimum-cli`
    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    
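    # Other custom compression arguments (such as AWQ) can be appended here; a hedged
    # sketch (flag availability depends on your optimum-intel version, and the
    # calibration dataset choice is an assumption) would be:
    #   int4_compression_args += " --awq --dataset wikitext2"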
    # Build the final export command; the output directory is a required positional
    # argument of `optimum-cli export openvino`
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (this is typically done locally, not in Hugging Face Spaces)
    # For deployment, the model needs to be pre-compressed and uploaded
    os.system(export_command)

# Make sure the INT4 model exists (converting it if needed) before loading from its directory
check_and_convert_model()

# Load tokenizer from the exported model directory
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

# Initialize OpenVINO model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)
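# Note: an OpenVINO runtime configuration can also be passed at load time. A minimal,
# optional sketch (the property values are assumptions; tune them for your hardware):
#   ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
#   ov_model = OVModelForCausalLM.from_pretrained(str(int4_model_dir), ov_config=ov_config, ...)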

def convert_history_to_token(history):
    """
    Convert the conversation history into input token IDs using the tokenizer's chat template.
    `history` is assumed to follow Gradio's chatbot convention: a list of
    [user_message, assistant_message] pairs, with the latest assistant slot still empty.
    """
    messages = []
    for user_text, model_text in history:
        messages.append({"role": "user", "content": user_text})
        if model_text:
            messages.append({"role": "assistant", "content": model_text})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic: tokenize the conversation history and stream back a generated response.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be consumed from the streamer
    # as they are produced instead of waiting for generation to finish
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

    # Stream partial output back to the UI, updating the last history entry in place
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history

# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)
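# Note: `share=True` creates a temporary public gradio.live link, which is mainly useful
# for local runs; on Hugging Face Spaces the app is already served publicly without it.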