from pathlib import Path
import requests
from threading import Event, Thread
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
import gradio as gr

from llm_config import SUPPORTED_LLM_MODELS
from notebook_utils import device_widget

# Initialize model language options
model_languages = list(SUPPORTED_LLM_MODELS)

def update_model_id(model_language_value):
    model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
    # Repopulate the model dropdown and select its first entry
    return gr.update(choices=model_ids, value=model_ids[0])
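
# `get_model_path` is used below but was never defined in this file. A minimal
# sketch is given here, assuming (as in the OpenVINO LLM chatbot notebooks)
# that INT4 weights live under <model_id>/INT4_compressed_weights; adjust the
# directory layout and returned names to match your actual setup.
def get_model_path(model_language_value, model_id_value):
    model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
    pt_model_name = model_id_value.split("-")[0]
    int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
    return model_configuration, int4_model_dir, pt_model_name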

# Function to download the model if not already present
def download_model_if_needed(model_language_value, model_id_value):
    model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
    int4_weights = int4_model_dir / "openvino_model.bin"
    
    if not int4_weights.exists():
        print(f"Downloading model {model_id_value}...")
        # Add your download logic here (e.g., from a URL)
        # Example:
        # r = requests.get(model_configuration["model_url"])
        # with open(int4_weights, "wb") as f:
        #     f.write(r.content)
    
    return int4_model_dir
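
# One concrete way to materialize the weights (a sketch, not necessarily the
# original download path): assuming the model is published on the Hugging Face
# Hub under model_configuration["model_id"], snapshot_download fetches the
# whole INT4 folder in one call.
#
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id=model_configuration["model_id"], local_dir=int4_model_dir)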

# Load the tokenizer and the INT4 OpenVINO model for the selected configuration
def load_model(model_language_value, model_id_value):
    int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
    # Latency-oriented hints: one inference stream, no compiled-model cache
    ov_config = {
        hints.performance_mode(): hints.PerformanceMode.LATENCY,
        streams.num(): "1",
        props.cache_dir(): "",
    }

    model_dir = int4_model_dir
    model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]

    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device.value,  # `device` is the widget created in the UI section below
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )

    return tok, ov_model, model_configuration

# Gradio interface function for generating text responses.
# Assumes `history` already holds [user, bot] message pairs, with the bot slot
# of the last pair left empty for the response being generated.
def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
    tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
    input_ids = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
    # Stream decoded tokens back to the UI as they are produced
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,  # required for temperature/top_p/top_k to take effect
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation on a worker thread so this function can consume the streamer
    event = Event()

    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        event.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history

# Gradio UI within a Blocks context
with gr.Blocks() as iface:
    model_language = gr.Dropdown(
        choices=model_languages,
        value=model_languages[0],
        label="Model Language"
    )

    model_id = gr.Dropdown(
        choices=[],  # dynamically populated
        label="Model",
        value=None
    )

    model_language.change(update_model_id, inputs=model_language, outputs=[model_id])

    # These toggles are currently informational only; the INT4 path is always used
    prepare_int4_model = gr.Checkbox(
        value=True,
        label="Prepare INT4 Model"
    )
    enable_awq = gr.Checkbox(
        value=False,
        label="Enable AWQ",
        visible=False
    )

    device = device_widget("CPU", exclude=["NPU"])

    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
    top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
    
    history = gr.State([])

    iface_interface = gr.Interface(
        fn=generate_response,
        inputs=[
            history,
            temperature,
            top_p,
            top_k,
            repetition_penalty,
            model_language,
            model_id
        ],
        # generate_response yields the updated message-pair history, so a
        # Chatbot component (not a plain Textbox) renders it correctly
        outputs=[gr.Chatbot(label="Conversation History")],
        title="OpenVINO Chatbot"
    )

if __name__ == "__main__":
    # Launch once, outside the Blocks context; share=True exposes a public link
    iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)