finalf0 committed
Commit 85737a6
Parent: 65c9869

initial commit

Files changed (3):
  1. README.md +4 -3
  2. app.py +549 -49
  3. requirements.txt +9 -1
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
-title: Gradio Chatbot
+title: MiniCPM-V-2 6
 emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 4.36.1
+sdk_version: 4.22.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---

-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,63 +1,563 @@
 import gradio as gr
-from huggingface_hub import InferenceClient

-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]

-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})

-    messages.append({"role": "user", "content": message})

-    response = ""

-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content

-        response += token
-        yield response

 """
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)


-if __name__ == "__main__":
-    demo.launch()
+#!/usr/bin/env python
+# encoding: utf-8
+import spaces
+import torch
+import argparse
+from transformers import AutoModel, AutoTokenizer
 import gradio as gr
+from PIL import Image
+from decord import VideoReader, cpu
+import io
+import os
+import copy
+import requests
+import base64
+import json
+import traceback
+import re
+import modelscope_studio as mgr

+
+# README, how to run the demo on different devices
+
+# For NVIDIA GPUs:
+# python web_demo_2.6.py --device cuda
+
+# For Mac with MPS (Apple silicon or AMD GPUs):
+# PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.6.py --device mps
+
+# Argparser
+parser = argparse.ArgumentParser(description='demo')
+parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
+parser.add_argument('--multi-gpus', action='store_true', default=False, help='use multi-gpus')
+args = parser.parse_args()
+device = args.device
+assert device in ['cuda', 'mps']
+
+# Load model
+model_path = 'openbmb/MiniCPM-V-2_6'
+if 'int4' in model_path:
+    if device == 'mps':
+        print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
+        exit()
+    #model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
+    model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+else:
+    if args.multi_gpus:
+        from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
+        with init_empty_weights():
+            model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+        device_map = infer_auto_device_map(model, max_memory={0: "10GB", 1: "10GB"},
+            no_split_module_classes=['SiglipVisionTransformer', 'Qwen2DecoderLayer'])
+        device_id = device_map["llm.model.embed_tokens"]
+        device_map["llm.lm_head"] = device_id  # first and last layers should be on the same device
+        device_map["vpm"] = device_id
+        device_map["resampler"] = device_id
+        device_id2 = device_map["llm.model.layers.26"]
+        device_map["llm.model.layers.8"] = device_id2
+        device_map["llm.model.layers.9"] = device_id2
+        device_map["llm.model.layers.10"] = device_id2
+        device_map["llm.model.layers.11"] = device_id2
+        device_map["llm.model.layers.12"] = device_id2
+        device_map["llm.model.layers.13"] = device_id2
+        device_map["llm.model.layers.14"] = device_id2
+        device_map["llm.model.layers.15"] = device_id2
+        device_map["llm.model.layers.16"] = device_id2
+        #print(device_map)
+
+        model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
+    else:
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+        model = model.to(device=device)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model.eval()
+
+
+
+
+ERROR_MSG = "Error, please retry"
+model_name = 'MiniCPM-V 2.6'
+MAX_NUM_FRAMES = 64
+IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
+VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
+
+def get_file_extension(filename):
+    return os.path.splitext(filename)[1].lower()
+
+def is_image(filename):
+    return get_file_extension(filename) in IMAGE_EXTENSIONS
+
+def is_video(filename):
+    return get_file_extension(filename) in VIDEO_EXTENSIONS
+
+
+form_radio = {
+    'choices': ['Beam Search', 'Sampling'],
+    #'value': 'Beam Search',
+    'value': 'Sampling',
+    'interactive': True,
+    'label': 'Decode Type'
+}
+
+
+def create_component(params, comp='Slider'):
+    if comp == 'Slider':
+        return gr.Slider(
+            minimum=params['minimum'],
+            maximum=params['maximum'],
+            value=params['value'],
+            step=params['step'],
+            interactive=params['interactive'],
+            label=params['label']
+        )
+    elif comp == 'Radio':
+        return gr.Radio(
+            choices=params['choices'],
+            value=params['value'],
+            interactive=params['interactive'],
+            label=params['label']
+        )
+    elif comp == 'Button':
+        return gr.Button(
+            value=params['value'],
+            interactive=True
+        )
+
+
+def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
+    return mgr.MultimodalInput(upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
+                               upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
+                               submit_button_props={'label': 'Submit'})
+
+
+@spaces.GPU(duration=120)
+def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
+    try:
+        print('msgs:', msgs)
+        answer = model.chat(
+            image=None,
+            msgs=msgs,
+            tokenizer=tokenizer,
+            **params
+        )
+        res = re.sub(r'(<box>.*</box>)', '', answer)
+        res = res.replace('<ref>', '')
+        res = res.replace('</ref>', '')
+        res = res.replace('<box>', '')
+        answer = res.replace('</box>', '')
+        print('answer:', answer)
+        return 0, answer, None, None
+    except Exception as e:
+        print(e)
+        traceback.print_exc()
+        return -1, ERROR_MSG, None, None
+
+
+def encode_image(image):
+    if not isinstance(image, Image.Image):
+        if hasattr(image, 'path'):
+            image = Image.open(image.path).convert("RGB")
+        else:
+            image = Image.open(image.file.path).convert("RGB")
+    # resize to max_size
+    max_size = 448*16
+    if max(image.size) > max_size:
+        w, h = image.size
+        if w > h:
+            new_w = max_size
+            new_h = int(h * max_size / w)
+        else:
+            new_h = max_size
+            new_w = int(w * max_size / h)
+        image = image.resize((new_w, new_h), resample=Image.BICUBIC)
+    return image
+    ## save by BytesIO and convert to base64
+    #buffered = io.BytesIO()
+    #image.save(buffered, format="png")
+    #im_b64 = base64.b64encode(buffered.getvalue()).decode()
+    #return {"type": "image", "pairs": im_b64}
+
+
+def encode_video(video):
+    def uniform_sample(l, n):
+        gap = len(l) / n
+        idxs = [int(i * gap + gap / 2) for i in range(n)]
+        return [l[i] for i in idxs]
+
+    if hasattr(video, 'path'):
+        vr = VideoReader(video.path, ctx=cpu(0))
+    else:
+        vr = VideoReader(video.file.path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+    frame_idx = [i for i in range(0, len(vr), sample_fps)]
+    if len(frame_idx) > MAX_NUM_FRAMES:
+        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+    video = vr.get_batch(frame_idx).asnumpy()
+    video = [Image.fromarray(v.astype('uint8')) for v in video]
+    video = [encode_image(v) for v in video]
+    print('video frames:', len(video))
+    return video
+
+
+def check_mm_type(mm_file):
+    if hasattr(mm_file, 'path'):
+        path = mm_file.path
+    else:
+        path = mm_file.file.path
+    if is_image(path):
+        return "image"
+    if is_video(path):
+        return "video"
+    return None
+
+
+def encode_mm_file(mm_file):
+    if check_mm_type(mm_file) == 'image':
+        return [encode_image(mm_file)]
+    if check_mm_type(mm_file) == 'video':
+        return encode_video(mm_file)
+    return None
+
+def make_text(text):
+    #return {"type": "text", "pairs": text}  # For remote call
+    return text
+
+def encode_message(_question):
+    files = _question.files
+    question = _question.text
+    pattern = r"\[mm_media\]\d+\[/mm_media\]"
+    matches = re.split(pattern, question)
+    message = []
+    if len(matches) != len(files) + 1:
+        gr.Warning("Number of images does not match the placeholders in the text, please refresh the page to restart!")
+    assert len(matches) == len(files) + 1
+
+    text = matches[0].strip()
+    if text:
+        message.append(make_text(text))
+    for i in range(len(files)):
+        message += encode_mm_file(files[i])
+        text = matches[i + 1].strip()
+        if text:
+            message.append(make_text(text))
+    return message


+def check_has_videos(_question):
+    images_cnt = 0
+    videos_cnt = 0
+    for file in _question.files:
+        if check_mm_type(file) == "image":
+            images_cnt += 1
+        else:
+            videos_cnt += 1
+    return images_cnt, videos_cnt


+def count_video_frames(_context):
+    num_frames = 0
+    for message in _context:
+        for item in message["content"]:
+            #if item["type"] == "image":  # For remote call
+            if isinstance(item, Image.Image):
+                num_frames += 1
+    return num_frames


+def respond(_question, _chat_bot, _app_cfg, params_form):
+    print("[respond] question:", _question)
+    _context = _app_cfg['ctx'].copy()
+    _context.append({'role': 'user', 'content': encode_message(_question)})
+
+    images_cnt = _app_cfg['images_cnt']
+    videos_cnt = _app_cfg['videos_cnt']
+    files_cnts = check_has_videos(_question)
+    if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
+        gr.Warning("Only supports single video file input right now!")
+        return _question, _chat_bot, _app_cfg
+
+    if params_form == 'Beam Search':
+        params = {
+            'sampling': False,
+            'num_beams': 3,
+            'repetition_penalty': 1.2,
+            "max_new_tokens": 2048
+        }
+    else:
+        params = {
+            'sampling': True,
+            'top_p': 0.8,
+            'top_k': 100,
+            'temperature': 0.7,
+            'repetition_penalty': 1.05,
+            "max_new_tokens": 2048
+        }
+
+    if files_cnts[1] + videos_cnt > 0:
+        params["max_inp_length"] = 4352  # 4096+256
+        params["use_image_id"] = False
+        params["max_slice_nums"] = 1 if count_video_frames(_context) > 16 else 2
+
+    code, _answer, _, sts = chat("", _context, None, params)
+
+    images_cnt += files_cnts[0]
+    videos_cnt += files_cnts[1]
+    _context.append({"role": "assistant", "content": [make_text(_answer)]})
+    _chat_bot.append((_question, _answer))
+    if code == 0:
+        _app_cfg['ctx'] = _context
+        _app_cfg['sts'] = sts
+        _app_cfg['images_cnt'] = images_cnt
+        _app_cfg['videos_cnt'] = videos_cnt
+
+    upload_image_disabled = videos_cnt > 0
+    upload_video_disabled = videos_cnt > 0 or images_cnt > 0
+    return create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg
+
+
+def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
+    ctx = _app_cfg["ctx"]
+    message_item = []
+    if _image is not None:
+        image = Image.open(_image).convert("RGB")
+        ctx.append({"role": "user", "content": [encode_image(image), make_text(_user_message)]})
+        message_item.append({"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]})
+    else:
+        if _user_message:
+            ctx.append({"role": "user", "content": [make_text(_user_message)]})
+            message_item.append({"text": _user_message, "files": []})
+        else:
+            message_item.append(None)
+    if _assistant_message:
+        ctx.append({"role": "assistant", "content": [make_text(_assistant_message)]})
+        message_item.append({"text": _assistant_message, "files": []})
+    else:
+        message_item.append(None)
+
+    _chat_bot.append(message_item)
+    return None, "", "", _chat_bot, _app_cfg
+
+
+def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form):
+    user_message_contents = []
+    _context = _app_cfg["ctx"].copy()
+    if _image:
+        image = Image.open(_image).convert("RGB")
+        user_message_contents += [encode_image(image)]
+    if _user_message:
+        user_message_contents += [make_text(_user_message)]
+    if user_message_contents:
+        _context.append({"role": "user", "content": user_message_contents})
+
+    if params_form == 'Beam Search':
+        params = {
+            'sampling': False,
+            'num_beams': 3,
+            'repetition_penalty': 1.2,
+            "max_new_tokens": 2048
+        }
+    else:
+        params = {
+            'sampling': True,
+            'top_p': 0.8,
+            'top_k': 100,
+            'temperature': 0.7,
+            'repetition_penalty': 1.05,
+            "max_new_tokens": 2048
+        }
+
+    code, _answer, _, sts = chat("", _context, None, params)
+
+    _context.append({"role": "assistant", "content": [make_text(_answer)]})
+
+    if _image:
+        _chat_bot.append([
+            {"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]},
+            {"text": _answer, "files": []}
+        ])
+    else:
+        _chat_bot.append([
+            {"text": _user_message, "files": [_image]},
+            {"text": _answer, "files": []}
+        ])
+    if code == 0:
+        _app_cfg['ctx'] = _context
+        _app_cfg['sts'] = sts
+    return None, '', '', _chat_bot, _app_cfg
+
+
+def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form):
+    if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
+        gr.Warning('No question for regeneration.')
+        return '', _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+    if _app_cfg["chat_type"] == "Chat":
+        images_cnt = _app_cfg['images_cnt']
+        videos_cnt = _app_cfg['videos_cnt']
+        _question = _chat_bot[-1][0]
+        _chat_bot = _chat_bot[:-1]
+        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
+        files_cnts = check_has_videos(_question)
+        images_cnt -= files_cnts[0]
+        videos_cnt -= files_cnts[1]
+        _app_cfg['images_cnt'] = images_cnt
+        _app_cfg['videos_cnt'] = videos_cnt
+        upload_image_disabled = videos_cnt > 0
+        upload_video_disabled = videos_cnt > 0 or images_cnt > 0
+        _question, _chat_bot, _app_cfg = respond(_question, _chat_bot, _app_cfg, params_form)
+        return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+    else:
+        last_message = _chat_bot[-1][0]
+        last_image = None
+        last_user_message = ''
+        if last_message.text:
+            last_user_message = last_message.text
+        if last_message.files:
+            last_image = last_message.files[0].file.path
+        _chat_bot = _chat_bot[:-1]
+        _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
+        _image, _user_message, _assistant_message, _chat_bot, _app_cfg = fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form)
+        return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
+
+
+def flushed():
+    return gr.update(interactive=True)
+
+
+def clear(txt_message, chat_bot, app_session):
+    txt_message.files.clear()
+    txt_message.text = ''
+    chat_bot = copy.deepcopy(init_conversation)
+    app_session['sts'] = None
+    app_session['ctx'] = []
+    app_session['images_cnt'] = 0
+    app_session['videos_cnt'] = 0
+    return create_multimodal_input(), chat_bot, app_session, None, '', ''
+
+
+def select_chat_type(_tab, _app_cfg):
+    _app_cfg["chat_type"] = _tab
+    return _app_cfg
+
+
+init_conversation = [
+    [
+        None,
+        {
+            # The first bot message disables the typewriter effect.
+            "text": "You can talk to me now",
+            "flushing": False
+        }
+    ],
+]
+
+
+css = """
+video { height: auto !important; }
+.example label { font-size: 16px;}
 """
+
+introduction = """
+
+## Features:
+1. Chat with single image
+2. Chat with multiple images
+3. Chat with video
+4. In-context few-shot learning
+
+Click `How to use` tab to see examples.
 """


+with gr.Blocks(css=css) as demo:
+    with gr.Tab(model_name):
+        with gr.Row():
+            with gr.Column(scale=1, min_width=300):
+                gr.Markdown(value=introduction)
+                params_form = create_component(form_radio, comp='Radio')
+                regenerate = create_component({'value': 'Regenerate'}, comp='Button')
+                clear_button = create_component({'value': 'Clear History'}, comp='Button')
+
+            with gr.Column(scale=3, min_width=500):
+                app_session = gr.State({'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0, 'chat_type': 'Chat'})
+                chat_bot = mgr.Chatbot(label=f"Chat with {model_name}", value=copy.deepcopy(init_conversation), height=600, flushing=False, bubble_full_width=False)
+
+                with gr.Tab("Chat") as chat_tab:
+                    txt_message = create_multimodal_input()
+                    chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
+
+                    txt_message.submit(
+                        respond,
+                        [txt_message, chat_bot, app_session, params_form],
+                        [txt_message, chat_bot, app_session]
+                    )
+
+                with gr.Tab("Few Shot") as fewshot_tab:
+                    fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            image_input = gr.Image(type="filepath", sources=["upload"])
+                        with gr.Column(scale=3):
+                            user_message = gr.Textbox(label="User")
+                            assistant_message = gr.Textbox(label="Assistant")
+                            with gr.Row():
+                                add_demonstration_button = gr.Button("Add Example")
+                                generate_button = gr.Button(value="Generate", variant="primary")
+                    add_demonstration_button.click(
+                        fewshot_add_demonstration,
+                        [image_input, user_message, assistant_message, chat_bot, app_session],
+                        [image_input, user_message, assistant_message, chat_bot, app_session]
+                    )
+                    generate_button.click(
+                        fewshot_respond,
+                        [image_input, user_message, chat_bot, app_session, params_form],
+                        [image_input, user_message, assistant_message, chat_bot, app_session]
+                    )
+
+                chat_tab.select(
+                    select_chat_type,
+                    [chat_tab_label, app_session],
+                    [app_session]
+                )
+                chat_tab.select(  # do clear
+                    clear,
+                    [txt_message, chat_bot, app_session],
+                    [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
+                )
+                fewshot_tab.select(
+                    select_chat_type,
+                    [fewshot_tab_label, app_session],
+                    [app_session]
+                )
+                fewshot_tab.select(  # do clear
+                    clear,
+                    [txt_message, chat_bot, app_session],
+                    [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
+                )
+                chat_bot.flushed(
+                    flushed,
+                    outputs=[txt_message]
+                )
+                regenerate.click(
+                    regenerate_button_clicked,
+                    [txt_message, image_input, user_message, assistant_message, chat_bot, app_session, params_form],
+                    [txt_message, image_input, user_message, assistant_message, chat_bot, app_session]
+                )
+                clear_button.click(
+                    clear,
+                    [txt_message, chat_bot, app_session],
+                    [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
+                )
+
+    with gr.Tab("How to use"):
+        with gr.Column():
+            with gr.Row():
+                image_example = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/m_bear2.gif", label='1. Chat with single or multiple images', interactive=False, width=400, elem_classes="example")
+                example2 = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/video2.gif", label='2. Chat with video', interactive=False, width=400, elem_classes="example")
+                example3 = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/fshot.gif", label='3. Few shot', interactive=False, width=400, elem_classes="example")
+
+
+# launch
+#demo.launch(share=False, debug=True, show_api=False, server_port=8885, server_name="0.0.0.0")
+demo.queue()
+demo.launch()
+
requirements.txt CHANGED
@@ -1 +1,9 @@
-huggingface_hub==0.22.2
+Pillow==10.1.0
+torch==2.1.2
+torchvision==0.16.2
+transformers==4.40.0
+sentencepiece==0.1.99
+opencv-python
+decord
+gradio==4.22.0
+http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl
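
Note: all generation in the committed app.py goes through the chat() helper, which forwards messages to model.chat() from the openbmb/MiniCPM-V-2_6 remote code. A minimal standalone sketch of that call pattern, outside the Gradio UI, could look like the following (a sketch only; it assumes a CUDA GPU and uses 'example.jpg' as a placeholder image path that is not part of this commit):

    # Minimal sketch of the model.chat() usage that app.py wraps.
    import torch
    from PIL import Image
    from transformers import AutoModel, AutoTokenizer

    model_path = 'openbmb/MiniCPM-V-2_6'
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
                                      attn_implementation='sdpa',
                                      torch_dtype=torch.bfloat16).to('cuda').eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Same message format the demo builds in encode_message():
    # a content list mixing PIL images and plain strings.
    image = Image.open('example.jpg').convert('RGB')  # placeholder path
    msgs = [{'role': 'user', 'content': [image, 'Describe this image.']}]

    # Parameters mirror the demo's "Sampling" decode type.
    answer = model.chat(image=None, msgs=msgs, tokenizer=tokenizer,
                        sampling=True, top_p=0.8, top_k=100, temperature=0.7,
                        repetition_penalty=1.05, max_new_tokens=2048)
    print(answer)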