import os
import torch
import gradio as gr
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
from typing import List, Tuple
from threading import Event, Thread
from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
from notebook_utils import device_widget  # Device selection utility
import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
import requests
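
# Module-level state: populated by load_model_on_select() in the Gradio interface
# below and read by bot(). Keeping these as globals mirrors the notebook-style flow
# this script is adapted from.
ov_model = None
tok = None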

# Convert the selected model to INT4 OpenVINO weights (export via optimum-cli)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])
    
    # Reuse previously converted weights if they already exist
    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + f" {str(int4_model_dir)}"
    
    # Execute export command (shell command)
    os.system(export_command)
    return int4_model_dir
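
# For illustration only (not taken from a real run): with the compression defaults
# above, the command assembled for the qwen2.5-0.5b-instruct entry would look
# roughly like
#
#   optimum-cli export openvino --model <hf-model-id> --task text-generation-with-past \
#       --weight-format int4 --group-size 128 --ratio 1.0 --sym \
#       qwen2.5-0.5b-instruct/INT4_compressed_weights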

# Model and tokenizer loading
def load_model(model_dir, device):
    ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )

    return ov_model, tok
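

# The chat handler below relies on two helpers that are not defined in this snippet:
# convert_history_to_token() and text_processor(). The versions here are a minimal
# sketch under two assumptions: the tokenizer supports apply_chat_template(), and
# partial streamed text can simply be concatenated. Swap in the model-specific
# helpers from your configuration if you have them.
def convert_history_to_token(history: List[Tuple[str, str]]):
    # Flatten the Gradio chat history ([[user, assistant], ...]) into chat messages
    # and tokenize them with the model's chat template.
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")


def text_processor(partial_text: str, new_text: str) -> str:
    # Simplest possible handling of streamed chunks: append the new text.
    return partial_text + new_text
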

# Gradio chat handler: streams the model's reply token by token into the chat history
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    # Uses the module-level ov_model and tok set by load_model_on_select()
    input_ids = convert_history_to_token(history)
    if input_ids.shape[1] > 2000:
        history = [history[-1]]  # Limit input size
        input_ids = convert_history_to_token(history)
    
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a separate thread so the streamer can be consumed in parallel
    stream_complete = Event()

    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        stream_complete.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    # Process partial text and return updated history
    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history

# Define a Gradio interface for user interaction
def create_gradio_interface():
    with gr.Blocks() as demo:
        # Dropdowns for selecting the model language and, based on it, the model ID
        model_languages = list(SUPPORTED_LLM_MODELS.keys())
        model_language = gr.Dropdown(choices=model_languages, value=model_languages[0], label="Model Language")
        default_model_ids = list(SUPPORTED_LLM_MODELS[model_languages[0]].keys())
        model_id = gr.Dropdown(choices=default_model_ids, value=default_model_ids[0], label="Model ID")

        # Checkbox for enabling AWQ during INT4 weight compression
        enable_awq = gr.Checkbox(value=False, label="Enable AWQ for Compression")

        # When the language changes, refresh the list of model IDs
        def update_model_ids(language):
            model_ids = list(SUPPORTED_LLM_MODELS[language].keys())
            return gr.update(choices=model_ids, value=model_ids[0])

        model_language.change(update_model_ids, inputs=model_language, outputs=model_id)

        # Convert (if needed) and load the selected model, storing it in the module
        # globals so that bot() can use it
        def load_model_on_select(language, selected_model_id, awq):
            global ov_model, tok
            model_configuration = SUPPORTED_LLM_MODELS[language][selected_model_id]
            int4_model_dir = convert_to_int4(selected_model_id, model_configuration, awq)

            device = device_widget("CPU").value  # device selection widget; defaults to CPU
            ov_model, tok = load_model(int4_model_dir, device)
            return f"Loaded {selected_model_id} on {device}"

        load_button = gr.Button("Load Model")
        model_status = gr.Textbox(label="Model Status")
        load_button.click(load_model_on_select, inputs=[model_language, model_id, enable_awq], outputs=model_status)

        # Chatbot output, user input, and generation parameters
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="Message")
        with gr.Row():
            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, label="Temperature", value=0.7)
            top_p = gr.Slider(minimum=0, maximum=1, step=0.1, label="Top-p", value=0.9)
            top_k = gr.Slider(minimum=0, maximum=50, step=1, label="Top-k", value=50)
            repetition_penalty = gr.Slider(minimum=0, maximum=2, step=0.1, label="Repetition Penalty", value=1.0)
        conversation_id = gr.State("")

        # Append the user's turn to the history, then stream the model's reply via bot()
        def add_user_message(message, history):
            return "", history + [[message, ""]]

        msg.submit(add_user_message, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False).then(
            bot,
            inputs=[chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id],
            outputs=chatbot,
        )

        # Queue inputs and outputs so concurrent, streaming generation calls are handled
        demo.queue()

    return demo

# Run the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True, share=True)  # share=True for public access