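"""Run the Med-App services: a static HTTP server for reference documents,
a Flask endpoint serving the LLM, and the Gradio chatbot UI. Each service
runs in its own thread (see the main block at the bottom of this file)."""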
import threading
import http.server
import socketserver
import os
import yaml
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import gradio as gr
from utils.upload_file import UploadFile
from utils.chatbot import ChatBot
from utils.ui_settings import UISettings
from utils.load_config import LoadConfig
from pyprojroot import here

# Load the app config
with open(here("configs/app_config.yml")) as cfg:
    app_config = yaml.load(cfg, Loader=yaml.FullLoader)

PORT = app_config["serve"]["port"]
DIRECTORY1 = app_config["directories"]["data_directory"]
DIRECTORY2 = app_config["directories"]["data_directory_2"]

# ================================
# Part 1: Reference Serve Code
# ================================
class MultiDirectoryHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Serve files from multiple directories."""
    def translate_path(self, path):
        parts = path.split('/', 2)
        if len(parts) > 1:
            first_directory = parts[1]
            if first_directory == os.path.basename(DIRECTORY1):
                path = os.path.join(DIRECTORY1, *parts[2:])
            elif first_directory == os.path.basename(DIRECTORY2):
                path = os.path.join(DIRECTORY2, *parts[2:])
            else:
                file_path1 = os.path.join(DIRECTORY1, first_directory)
                file_path2 = os.path.join(DIRECTORY2, first_directory)
                if os.path.isfile(file_path1):
                    return file_path1
                elif os.path.isfile(file_path2):
                    return file_path2
        return super().translate_path(path)
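
# Illustrative URL-to-path mapping for the handler above (assuming, hypothetically,
# that DIRECTORY1 ends in "docs" and DIRECTORY2 ends in "uploads"):
#   GET /docs/report.pdf     -> <DIRECTORY1>/report.pdf
#   GET /uploads/report.pdf  -> <DIRECTORY2>/report.pdf
#   GET /report.pdf          -> served from whichever directory contains the file,
#                               otherwise falls back to the default lookup.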

def start_reference_server():
    with socketserver.TCPServer(("", PORT), MultiDirectoryHTTPRequestHandler) as httpd:
        print(f"Serving at port {PORT}")
        httpd.serve_forever()

# ================================
# Part 2: LLM Serve Code
# ================================
APPCFG = LoadConfig()

app = Flask(__name__)

# Load the LLM and tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    APPCFG.llm_engine, token=APPCFG.gemma_token, device=APPCFG.device)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="BioMistral/BioMistral-7B",
    token=APPCFG.gemma_token,
    torch_dtype=torch.float16,
    device_map=APPCFG.device
)
app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

@app.route("/generate_text", methods=["POST"])
def generate_text():
    """Generate a completion for a chat-style prompt received as JSON."""
    data = request.json
    prompt = data.get("prompt", "")
    max_new_tokens = data.get("max_new_tokens", 1000)
    do_sample = data.get("do_sample", True)
    temperature = data.get("temperature", 0.1)
    top_k = data.get("top_k", 50)
    top_p = data.get("top_p", 0.95)

    # `prompt` is expected to be a list of chat messages; apply the model's chat
    # template before generation.
    tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
        prompt, tokenize=False, add_generation_prompt=True)
    outputs = app_pipeline(
        tokenized_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )

    # Return only the newly generated text (strip the echoed prompt)
    return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})

def start_llm_server():
    app.run(debug=False, port=8888)
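
# A minimal client-side sketch for calling the endpoint above (illustrative only;
# assumes the Flask server is running locally on port 8888 as set in start_llm_server).
# Note that "prompt" is forwarded to apply_chat_template, so it should be a list of
# chat messages rather than a plain string:
#
#   import requests
#   reply = requests.post(
#       "http://127.0.0.1:8888/generate_text",
#       json={
#           "prompt": [{"role": "user", "content": "What are common symptoms of anemia?"}],
#           "max_new_tokens": 256,
#           "temperature": 0.1,
#       },
#   )
#   print(reply.json()["response"])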

# ================================
# Part 3: Gradio Chatbot Code
# ================================
def start_gradio_app():
    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("Med-App"):
                # First row
                with gr.Row() as row_one:
                    with gr.Column(visible=False) as reference_bar:
                        ref_output = gr.Markdown()
                    with gr.Column() as chatbot_output:
                        chatbot = gr.Chatbot(
                            [], elem_id="chatbot", bubble_full_width=False, height=500,
                            avatar_images=("images/test.png", "images/Gemma-logo.png")
                        )
                        chatbot.like(UISettings.feedback, None, None)

                # Second row
                with gr.Row():
                    input_txt = gr.Textbox(
                        lines=4, scale=8, placeholder="Enter text and press enter, or upload PDF files"
                    )

                # Third row
                with gr.Row() as row_three:
                    text_submit_btn = gr.Button(value="Submit text")
                    btn_toggle_sidebar = gr.Button(value="References")
                    upload_btn = gr.UploadButton(
                        "πŸ“ Upload PDF or doc files", file_types=['.pdf', '.doc'], file_count="multiple"
                    )
                    clear_button = gr.ClearButton([input_txt, chatbot])
                    rag_with_dropdown = gr.Dropdown(
                        label="RAG with", choices=["Preprocessed doc", "Upload doc: Process for RAG"], value="Preprocessed doc"
                    )

                # Fourth row
                with gr.Row() as row_four:
                    temperature_bar = gr.Slider(
                        minimum=0.1, maximum=1, value=0.1, step=0.1, label="Temperature",
                        info="Increasing the temperature will make the model answer more creatively."
                    )
                    top_k = gr.Slider(
                        minimum=0.0, maximum=100.0, step=1, label="top_k", value=50,
                        info="A lower value (e.g. 10) will result in more conservative answers."
                    )
                    top_p = gr.Slider(
                        minimum=0.0, maximum=1.0, step=0.01, label="top_p", value=0.95,
                        info="A lower value will generate more focused and conservative text."
                    )

                # Process uploaded files and text
                file_msg = upload_btn.upload(
                    fn=UploadFile.process_uploaded_files, inputs=[upload_btn, chatbot, rag_with_dropdown],
                    outputs=[input_txt, chatbot], queue=False
                )
                txt_msg = input_txt.submit(
                    fn=ChatBot.respond, inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, top_k, top_p],
                    outputs=[input_txt, chatbot, ref_output], queue=False
                ).then(lambda: gr.Textbox(interactive=True), None, [input_txt], queue=False)
                text_submit_btn.click(
                    fn=ChatBot.respond, inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, top_k, top_p],
                    outputs=[input_txt, chatbot, ref_output], queue=False
                ).then(lambda: gr.Textbox(interactive=True), None, [input_txt], queue=False)

    demo.launch()

# ================================
# Main: Running all services concurrently
# ================================
if __name__ == "__main__":
    # Start all services in separate threads
    reference_server_thread = threading.Thread(target=start_reference_server)
    llm_server_thread = threading.Thread(target=start_llm_server)
    gradio_app_thread = threading.Thread(target=start_gradio_app)

    reference_server_thread.start()
    llm_server_thread.start()
    gradio_app_thread.start()

    # Block the main thread until all services exit
    reference_server_thread.join()
    llm_server_thread.join()
    gradio_app_thread.join()