littlebird13 committed
Commit 97151b3 · verified · 1 Parent(s): e7e2d1b

Create app.py

Files changed (1):
  app.py +422 -0
app.py ADDED
@@ -0,0 +1,422 @@
import gradio as gr
from http import HTTPStatus
import uuid
from gradio_client import utils as client_utils
import gradio.processing_utils as processing_utils
import base64
from openai import OpenAI
import soundfile as sf
import numpy as np
import io
import os
import modelscope_studio.components.base as ms
import modelscope_studio.components.antd as antd
import oss2
from oss2.credentials import EnvironmentVariableCredentialsProvider

# Voice settings
VOICE_LIST = ['Cherry', 'Ethan', 'Serena', 'Chelsie']
DEFAULT_VOICE = 'Cherry'

# OSS credentials are read from the OSS_ACCESS_KEY_ID and
# OSS_ACCESS_KEY_SECRET environment variables.
auth = oss2.ProviderAuthV4(EnvironmentVariableCredentialsProvider())
endpoint = os.getenv("OSS_ENDPOINT")
region = os.getenv("OSS_REGION")
bucket_name = os.getenv("OSS_BUCKET_NAME")
bucket = oss2.Bucket(auth, endpoint, bucket_name, region=region)

default_system_prompt = 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'

API_KEY = os.environ['API_KEY']

client = OpenAI(
    api_key=API_KEY,
    base_url="https://poc-dashscope.aliyuncs.com/compatible-mode/v1",
)

is_modelscope_studio = os.getenv('MODELSCOPE_ENVIRONMENT') == 'studio'

def get_text(text: str, cn_text: str):
    if is_modelscope_studio:
        return cn_text
    return text

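# Inline a local file as a base64 `data:` URL tagged with its MIME type.
# Used by format_history as a fallback when a file could not be uploaded to OSS.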
def encode_file_to_base64(file_path):
    with open(file_path, "rb") as file:
        mime_type = client_utils.get_mimetype(file_path)
        base64_data = base64.b64encode(file.read()).decode("utf-8")
        return f"data:{mime_type};base64,{base64_data}"

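# Upload a local file to the OSS bucket and return a signed GET URL valid for
# one hour; if the upload fails, the local path is returned unchanged.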
def file_path_to_oss_url(file_path: str):
    if file_path.startswith("http"):
        return file_path
    ext = file_path.split('.')[-1]
    object_name = f'studio-temp/Qwen2.5-Omni-Demo/{uuid.uuid4()}.{ext}'
    response = bucket.put_object_from_file(object_name, file_path)
    file_url = file_path
    if response.status == HTTPStatus.OK:
        file_url = bucket.sign_url('GET', object_name, 60 * 60,
                                   slash_safe=True)
    return file_url

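# Convert the Gradio chat history into OpenAI-style messages, turning uploaded
# files into image_url / video_url / input_audio content parts.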
def format_history(history: list, system_prompt: str, oss_cache):
    messages = []
    messages.append({"role": "system", "content": system_prompt})
    for item in history:
        if isinstance(item["content"], str):
            messages.append({"role": item['role'], "content": item['content']})
        elif item["role"] == "user" and isinstance(item["content"],
                                                   (list, tuple)):
            file_path = item["content"][0]

            # Upload each file only once; dict.get(key, default) would evaluate
            # the upload eagerly even on a cache hit, so check membership first.
            if file_path in oss_cache:
                file_url = oss_cache[file_path]
            else:
                file_url = file_path_to_oss_url(file_path)
                oss_cache[file_path] = file_url

            # Fall back to an inline data: URL if the OSS upload failed.
            file_url = file_url if file_url.startswith(
                "http") else encode_file_to_base64(file_path=file_path)

            mime_type = client_utils.get_mimetype(file_path)
            ext = file_path.split('.')[-1]

            if mime_type.startswith("image"):
                messages.append({
                    "role": item['role'],
                    "content": [{
                        "type": "image_url",
                        "image_url": {"url": file_url}
                    }]
                })
            elif mime_type.startswith("video"):
                messages.append({
                    "role": item['role'],
                    "content": [{
                        "type": "video_url",
                        "video_url": {"url": file_url}
                    }]
                })
            elif mime_type.startswith("audio"):
                messages.append({
                    "role": item['role'],
                    "content": [{
                        "type": "input_audio",
                        "input_audio": {"data": file_url, "format": ext}
                    }]
                })
    return messages

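# Stream a response from the Omni model: yields partial text transcripts as
# they arrive, then decodes the accumulated base64 PCM chunks (16-bit, 24 kHz)
# into a WAV file and yields its cache path.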
def predict(messages, voice=DEFAULT_VOICE):
    print('predict history: ', messages)
    completion = client.chat.completions.create(
        model="pre-qwenvl-omni-perf-2",
        messages=messages,
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True})

    response_text = ""
    audio_str = ""
    for chunk in completion:
        if chunk.choices:
            delta = chunk.choices[0].delta
            if hasattr(delta, 'audio') and delta.audio and delta.audio.get(
                    "transcript"):
                response_text += delta.audio.get("transcript")
            if hasattr(delta, 'audio') and delta.audio and delta.audio.get(
                    "data"):
                audio_str += delta.audio.get("data")
            yield {"type": "text", "data": response_text}
    # Decode the accumulated base64 PCM stream into a playable WAV file.
    pcm_bytes = base64.b64decode(audio_str)
    audio_np = np.frombuffer(pcm_bytes, dtype=np.int16)
    wav_io = io.BytesIO()
    sf.write(wav_io, audio_np, samplerate=24000, format="WAV")
    wav_io.seek(0)
    wav_bytes = wav_io.getvalue()
    audio_path = processing_utils.save_bytes_to_cache(
        wav_bytes, "audio.wav", cache_dir=demo.GRADIO_CACHE)
    yield {"type": "audio", "data": audio_path}

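# Generator backing the Online tab: sends recorded audio/video to the model and
# progressively updates the chatbot while toggling the submit/stop buttons.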
def media_predict(audio, video, history, system_prompt, state_value,
                  voice_choice):
    files = [audio, video]
    for f in files:
        if f:
            history.append({"role": "user", "content": (f, )})

    formatted_history = format_history(history=history,
                                       system_prompt=system_prompt,
                                       oss_cache=state_value["oss_cache"])

    # First yield
    yield (
        None,  # microphone
        None,  # webcam
        history,  # media_chatbot
        gr.update(visible=False),  # submit_btn
        gr.update(visible=True),  # stop_btn
        state_value  # state
    )

    history.append({"role": "assistant", "content": ""})

    for chunk in predict(formatted_history, voice_choice):
        if chunk["type"] == "text":
            history[-1]["content"] = chunk["data"]
            yield (
                None,  # microphone
                None,  # webcam
                history,  # media_chatbot
                gr.update(visible=False),  # submit_btn
                gr.update(visible=True),  # stop_btn
                state_value  # state
            )
        if chunk["type"] == "audio":
            history.append({
                "role": "assistant",
                "content": gr.Audio(chunk["data"])
            })

    # Final yield
    yield (
        None,  # microphone
        None,  # webcam
        history,  # media_chatbot
        gr.update(visible=True),  # submit_btn
        gr.update(visible=False),  # stop_btn
        state_value  # state
    )

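# Generator backing the Offline tab: accepts text plus uploaded audio, image,
# and video, then streams the model's text and final audio reply into the chat.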
def chat_predict(text, audio, image, video, history, system_prompt,
                 state_value, voice_choice):
    # Process text input
    if text:
        history.append({"role": "user", "content": text})

    # Process audio input
    if audio:
        history.append({"role": "user", "content": (audio, )})

    # Process image input
    if image:
        history.append({"role": "user", "content": (image, )})

    # Process video input
    if video:
        history.append({"role": "user", "content": (video, )})

    formatted_history = format_history(history=history,
                                       system_prompt=system_prompt,
                                       oss_cache=state_value["oss_cache"])

    # Clear all inputs once the request is underway.
    yield None, None, None, None, history, state_value

    history.append({"role": "assistant", "content": ""})
    for chunk in predict(formatted_history, voice_choice):
        if chunk["type"] == "text":
            history[-1]["content"] = chunk["data"]
            yield (gr.skip(), gr.skip(), gr.skip(), gr.skip(), history,
                   state_value)
        if chunk["type"] == "audio":
            history.append({
                "role": "assistant",
                "content": gr.Audio(chunk["data"])
            })
    yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), history, state_value

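# Build the UI: a sidebar with the system prompt and voice picker, a header,
# and two tabs (Online recording, Offline uploads).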
with gr.Blocks() as demo, ms.Application(), antd.ConfigProvider():
    state = gr.State({"oss_cache": {}})

    with gr.Sidebar(open=False):
        system_prompt_textbox = gr.Textbox(label="System Prompt",
                                           value=default_system_prompt)
        voice_choice = gr.Dropdown(label="Voice Choice",
                                   choices=VOICE_LIST,
                                   value=DEFAULT_VOICE)
    with antd.Flex(gap="small", justify="center", align="center"):
        antd.Image('./logo-1.png', preview=False, width=67, height=67)
        with antd.Flex(vertical=True, gap="small", align="center"):
            antd.Typography.Title("Qwen2.5-Omni Demo",
                                  level=1,
                                  elem_style=dict(margin=0, fontSize=28))
            with antd.Flex(vertical=True, gap="small"):
                antd.Typography.Text(get_text("🎯 Instructions for use:",
                                              "🎯 使用说明:"),
                                     strong=True)
                antd.Typography.Text(
                    get_text(
                        "1️⃣ Click the Audio Record button or the Camera Record button.",
                        "1️⃣ 点击音频录制按钮,或摄像头-录制按钮"))
                antd.Typography.Text(
                    get_text("2️⃣ Input audio or video.", "2️⃣ 输入音频或者视频"))
                antd.Typography.Text(
                    get_text(
                        "3️⃣ Click the submit button and wait for the model's response.",
                        "3️⃣ 点击提交并等待模型的回答"))
        antd.Image('./logo-2.png',
                   preview=False,
                   width=80,
                   height=80,
                   elem_style=dict(marginTop=5))
    with gr.Tabs():
        with gr.Tab("Online"):
            with gr.Row():
                with gr.Column(scale=1):
                    microphone = gr.Audio(sources=['microphone'],
                                          format="wav",
                                          type="filepath")
                    webcam = gr.Video(sources=['webcam'],
                                      format="mp4",
                                      height=400,
                                      include_audio=True)
                    submit_btn = gr.Button(get_text("Submit", "提交"),
                                           variant="primary")
                    stop_btn = gr.Button(get_text("Stop", "停止"),
                                         visible=False)
                    clear_btn = gr.Button(get_text("Clear History", "清除历史"))
                with gr.Column(scale=2):
                    media_chatbot = gr.Chatbot(height=650, type="messages")

            def clear_history():
                return [], gr.update(value=None), gr.update(value=None)

            submit_event = submit_btn.click(fn=media_predict,
                                            inputs=[
                                                microphone, webcam,
                                                media_chatbot,
                                                system_prompt_textbox,
                                                state, voice_choice
                                            ],
                                            outputs=[
                                                microphone, webcam,
                                                media_chatbot, submit_btn,
                                                stop_btn, state
                                            ])
            stop_btn.click(
                fn=lambda:
                (gr.update(visible=True), gr.update(visible=False)),
                inputs=None,
                outputs=[submit_btn, stop_btn],
                cancels=[submit_event],
                queue=False)
            clear_btn.click(fn=clear_history,
                            inputs=None,
                            outputs=[media_chatbot, microphone, webcam])

        with gr.Tab("Offline"):
            chatbot = gr.Chatbot(type="messages", height=650)

            # Media upload section in one row
            with gr.Row(equal_height=True):
                audio_input = gr.Audio(sources=["upload"],
                                       type="filepath",
                                       label="Upload Audio",
                                       elem_classes="media-upload",
                                       scale=1)
                image_input = gr.Image(sources=["upload"],
                                       type="filepath",
                                       label="Upload Image",
                                       elem_classes="media-upload",
                                       scale=1)
                video_input = gr.Video(sources=["upload"],
                                       label="Upload Video",
                                       elem_classes="media-upload",
                                       scale=1)

            # Text input section
            text_input = gr.Textbox(show_label=False,
                                    placeholder="Enter text here...")

            # Control buttons
            with gr.Row():
                submit_btn = gr.Button(get_text("Submit", "提交"),
                                       variant="primary",
                                       size="lg")
                stop_btn = gr.Button(get_text("Stop", "停止"),
                                     visible=False,
                                     size="lg")
                clear_btn = gr.Button(get_text("Clear History", "清除历史"),
                                      size="lg")

            def clear_chat_history():
                return ([], gr.update(value=None), gr.update(value=None),
                        gr.update(value=None), gr.update(value=None))

            submit_event = gr.on(
                triggers=[submit_btn.click, text_input.submit],
                fn=chat_predict,
                inputs=[
                    text_input, audio_input, image_input, video_input, chatbot,
                    system_prompt_textbox, state, voice_choice
                ],
                outputs=[
                    text_input, audio_input, image_input, video_input, chatbot,
                    state
                ])

            stop_btn.click(
                fn=lambda:
                (gr.update(visible=True), gr.update(visible=False)),
                inputs=None,
                outputs=[submit_btn, stop_btn],
                cancels=[submit_event],
                queue=False)

            clear_btn.click(fn=clear_chat_history,
                            inputs=None,
                            outputs=[
                                chatbot, text_input, audio_input, image_input,
                                video_input
                            ])

    # Add some custom CSS to improve the layout
    gr.HTML("""
    <style>
        .media-upload {
            margin: 10px;
            min-height: 160px;
        }
        .media-upload > .wrap {
            border: 2px dashed #ccc;
            border-radius: 8px;
            padding: 10px;
            height: 100%;
        }
        .media-upload:hover > .wrap {
            border-color: #666;
        }
        /* Make upload areas equal width */
        .media-upload {
            flex: 1;
            min-width: 0;
        }
    </style>
    """)

demo.queue(default_concurrency_limit=100, max_size=100).launch(max_threads=100)