torVik committed
Commit 098aed9 · verified · 1 Parent(s): b6d6110

Update app.py

Files changed (1)
  1. app.py +371 -371
app.py CHANGED
@@ -1,371 +1,371 @@
#!/usr/bin/env python

import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Debugging: Start script
print("Starting script...")

HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    print("Warning: HF_TOKEN is not set!")

PASSWORD = os.getenv("APP_PASSWORD", "mysecretpassword") # Set your desired password here or via environment variable

DESCRIPTION = "# FT of Lama"

if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
    print("Warning: No GPU available. This model cannot run on CPU.")
else:
    print("GPU is available!")

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Debugging: GPU check passed, loading model
if torch.cuda.is_available():
-    model_id = "BGLAW/llama-3-8b-Instruct-bglawinsv1UNS_merged"
+    model_id = "torVik/bggpt-Instruct-bglawinsv1UNS"
    try:
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", token=HF_TOKEN)
        print("Model loaded successfully!")

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
        print("Tokenizer loaded successfully!")
    except Exception as e:
        print(f"Error loading model or tokenizer: {e}")
        raise e # Re-raise the error after logging it


@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    print(f"Received message: {message}")
    print(f"Chat history: {chat_history}")

    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    try:
        print("Tokenizing input...")
        input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
        print(f"Input tokenized: {input_ids.shape}")

        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
            print("Trimmed input tokens due to length.")

        input_ids = input_ids.to(model.device)
        print("Input moved to the model's device.")

        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            {"input_ids": input_ids},
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            num_beams=1,
            repetition_penalty=repetition_penalty,
        )

        print("Starting generation...")
        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()
        print("Thread started for model generation.")

        outputs = []
        for text in streamer:
            outputs.append(text)
            print(f"Generated text so far: {''.join(outputs)}")
            yield "".join(outputs)

    except Exception as e:
        print(f"Error during generation: {e}")
        raise e # Re-raise the error after logging it


def password_auth(password):
    if password == PASSWORD:
        return gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=False), gr.update(visible=True, value="Incorrect password. Try again.")

chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)

# Debugging: Interface setup
print("Setting up interface...")

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)

    # Create login components
    with gr.Row(visible=True) as login_area:
        password_input = gr.Textbox(
            label="Enter Password", type="password", placeholder="Password", show_label=True
        )
        login_btn = gr.Button("Submit")
        incorrect_password_msg = gr.Markdown("Incorrect password. Try again.", visible=False)

    # Main chat interface
    with gr.Column(visible=False) as chat_area:
        gr.Markdown(DESCRIPTION)
        gr.DuplicateButton(
            value="Duplicate Space for private use",
            elem_id="duplicate-button",
            visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
        )
        chat_interface.render()

    # Bind login button to check password
    login_btn.click(password_auth, inputs=password_input, outputs=[chat_area, incorrect_password_msg])

# Debugging: Starting queue and launching the demo
print("Launching demo...")

if __name__ == "__main__":
    demo.queue(max_size=20).launch(share=True)



# WORKING
# #!/usr/bin/env python

# import os
# from threading import Thread
# from typing import Iterator

# import gradio as gr
# import spaces
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# # Debugging: Start script
# print("Starting script...")

# HF_TOKEN = os.environ.get("HF_TOKEN")
# if HF_TOKEN is None:
#     print("Warning: HF_TOKEN is not set!")

# DESCRIPTION = "# Mistral-7B v0.2"

# if not torch.cuda.is_available():
#     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
#     print("Warning: No GPU available. This model cannot run on CPU.")
# else:
#     print("GPU is available!")

# MAX_MAX_NEW_TOKENS = 2048
# DEFAULT_MAX_NEW_TOKENS = 1024
# MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# # Debugging: GPU check passed, loading model
# if torch.cuda.is_available():
#     model_id = "mistralai/Mistral-7B-Instruct-v0.2"
#     try:
#         print("Loading model...")
#         model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", token=HF_TOKEN)
#         print("Model loaded successfully!")

#         print("Loading tokenizer...")
#         tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
#         print("Tokenizer loaded successfully!")
#     except Exception as e:
#         print(f"Error loading model or tokenizer: {e}")
#         raise e # Re-raise the error after logging it


# @spaces.GPU
# def generate(
#     message: str,
#     chat_history: list[tuple[str, str]],
#     max_new_tokens: int = 1024,
#     temperature: float = 0.6,
#     top_p: float = 0.9,
#     top_k: int = 50,
#     repetition_penalty: float = 1.2,
# ) -> Iterator[str]:
#     print(f"Received message: {message}")
#     print(f"Chat history: {chat_history}")

#     conversation = []
#     for user, assistant in chat_history:
#         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
#     conversation.append({"role": "user", "content": message})

#     try:
#         print("Tokenizing input...")
#         input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
#         print(f"Input tokenized: {input_ids.shape}")

#         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
#             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
#             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
#             print("Trimmed input tokens due to length.")

#         input_ids = input_ids.to(model.device)
#         print("Input moved to the model's device.")

#         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
#         generate_kwargs = dict(
#             {"input_ids": input_ids},
#             streamer=streamer,
#             max_new_tokens=max_new_tokens,
#             do_sample=True,
#             top_p=top_p,
#             top_k=top_k,
#             temperature=temperature,
#             num_beams=1,
#             repetition_penalty=repetition_penalty,
#         )

#         print("Starting generation...")
#         t = Thread(target=model.generate, kwargs=generate_kwargs)
#         t.start()
#         print("Thread started for model generation.")

#         outputs = []
#         for text in streamer:
#             outputs.append(text)
#             print(f"Generated text so far: {''.join(outputs)}")
#             yield "".join(outputs)

#     except Exception as e:
#         print(f"Error during generation: {e}")
#         raise e # Re-raise the error after logging it


# chat_interface = gr.ChatInterface(
#     fn=generate,
#     additional_inputs=[
#         gr.Slider(
#             label="Max new tokens",
#             minimum=1,
#             maximum=MAX_MAX_NEW_TOKENS,
#             step=1,
#             value=DEFAULT_MAX_NEW_TOKENS,
#         ),
#         gr.Slider(
#             label="Temperature",
#             minimum=0.1,
#             maximum=4.0,
#             step=0.1,
#             value=0.6,
#         ),
#         gr.Slider(
#             label="Top-p (nucleus sampling)",
#             minimum=0.05,
#             maximum=1.0,
#             step=0.05,
#             value=0.9,
#         ),
#         gr.Slider(
#             label="Top-k",
#             minimum=1,
#             maximum=1000,
#             step=1,
#             value=50,
#         ),
#         gr.Slider(
#             label="Repetition penalty",
#             minimum=1.0,
#             maximum=2.0,
#             step=0.05,
#             value=1.2,
#         ),
#     ],
#     stop_btn=None,
#     examples=[
#         ["Hello there! How are you doing?"],
#         ["Can you explain briefly to me what is the Python programming language?"],
#         ["Explain the plot of Cinderella in a sentence."],
#         ["How many hours does it take a man to eat a Helicopter?"],
#         ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
#     ],
# )

# # Debugging: Interface setup
# print("Setting up interface...")

# with gr.Blocks(css="style.css") as demo:
#     gr.Markdown(DESCRIPTION)
#     gr.DuplicateButton(
#         value="Duplicate Space for private use",
#         elem_id="duplicate-button",
#         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
#     )
#     chat_interface.render()

# # Debugging: Starting queue and launching the demo
# print("Launching demo...")

# if __name__ == "__main__":
#     demo.queue(max_size=20).launch(share=True)
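
Context for trying this revision locally, not part of the commit itself: app.py takes all of its configuration from environment variables (HF_TOKEN, APP_PASSWORD, MAX_INPUT_TOKEN_LENGTH, SHOW_DUPLICATE_BUTTON). The sketch below is a minimal, assumed launcher placed next to app.py; the file name and all values are placeholders, and it relies on the __main__ guard in app.py so that importing the module builds the interface without launching it.

# launcher.py -- illustrative sketch only; not part of this commit.
# Set the environment variables app.py reads, then import it and launch the demo.
import os

os.environ.setdefault("HF_TOKEN", "hf_xxx")                 # placeholder; used when downloading the model and tokenizer
os.environ.setdefault("APP_PASSWORD", "choose-a-password")  # placeholder; checked by password_auth()
os.environ.setdefault("MAX_INPUT_TOKEN_LENGTH", "4096")     # prompt-trimming threshold read at import time
os.environ.setdefault("SHOW_DUPLICATE_BUTTON", "0")         # "1" shows the Duplicate Space button

import app  # builds `demo` (and loads the model when a GPU is present)

app.demo.queue(max_size=20).launch()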