KingNish committed
Commit 29c0cfc · verified · 1 Parent(s): 01335c7

Update app.py

Files changed (1): app.py (+40 -20)
app.py CHANGED
@@ -1,34 +1,40 @@
 import gradio as gr
 import spaces
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from qwen_vl_utils import process_vision_info
 import torch
 from PIL import Image
 import subprocess
 import numpy as np
 import os
-
-# Install flash-attn
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+from threading import Thread

 # Model and Processor Loading (Done once at startup)
 MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
-model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, attn_implementation="flash_attention_2", trust_remote_code=True, torch_dtype=torch.float16).to("cuda").eval()
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

 DESCRIPTION = "[Qwen2-VL-2B Demo](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)"

+image_extensions = Image.registered_extensions()
+video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")
+
+
 @spaces.GPU
 def qwen_inference(media_path, text_input=None):
-
-    image_extensions = Image.registered_extensions()
     if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
         media_type = "image"
-    elif media_path.endswith(("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")): # Check if it's a video path
+    elif media_path.endswith(video_extensions): # Check if it's a video path
         media_type = "video"
     else:
-        raise ValueError("Unsupported media type. Please upload an image or video.")
-
+        raise ValueError(
+            "Unsupported media type. Please upload an image or video."
+        )
+
     messages = [
         {
             "role": "user",
@@ -36,15 +42,17 @@ def qwen_inference(media_path, text_input=None):
                 {
                     "type": media_type,
                     media_type: media_path,
-                    **({"fps": 8.0} if media_type == "video" else {}),
+                    **({"fps": 8.0} if media_type == "video" else {}),
                 },
                 {"type": "text", "text": text_input},
             ],
         }
     ]

-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[text],
         images=image_inputs,
@@ -53,11 +61,19 @@ def qwen_inference(media_path, text_input=None):
         return_tensors="pt",
     ).to("cuda")

-    generated_ids = model.generate(**inputs, max_new_tokens=1024)
-    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-    return output_text
+    streamer = TextIteratorStreamer(
+        processor, skip_prompt=True, **{"skip_special_tokens": True}
+    )
+    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
+

 css = """
 #output {
@@ -73,12 +89,16 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab(label="Image/Video Input"):
         with gr.Row():
             with gr.Column():
-                input_media = gr.File(label="Upload Image or Video", type="filepath")
+                input_media = gr.File(
+                    label="Upload Image or Video", type="filepath"
+                )
                 text_input = gr.Textbox(label="Question")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
                 output_text = gr.Textbox(label="Output Text")

-        submit_btn.click(qwen_inference, [input_media, text_input], [output_text])
+        submit_btn.click(
+            qwen_inference, [input_media, text_input], [output_text]
+        )

 demo.launch(debug=True)
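
The substance of this commit is the switch from blocking generation to token streaming: the old handler ran model.generate to completion, trimmed off the prompt tokens, and returned one decoded string, while the new one starts model.generate in a background Thread, lets a TextIteratorStreamer decode tokens as they arrive, and turns qwen_inference into a generator that yields the accumulating text. Gradio streams generator output into output_text with no change to the submit_btn.click wiring, and passing the processor to the streamer works because the Qwen2-VL processor forwards decode to its underlying tokenizer. Below is a minimal, self-contained sketch of the same pattern; the text-only checkpoint and prompt are placeholders standing in for the Qwen2-VL setup, not part of the commit.

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder text-only model; the commit uses Qwen2VLForConditionalGeneration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

inputs = tokenizer("Streaming generation lets the UI", return_tensors="pt")

# skip_prompt drops the echoed input tokens; skip_special_tokens is passed
# through to decode(), mirroring **{"skip_special_tokens": True} above.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until the sequence finishes, so it runs in a worker
# thread while the caller drains the streamer's queue.
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=32)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # yields decoded chunks as they are produced
    buffer += new_text
    print(buffer)  # a Gradio generator handler would `yield buffer` instead
thread.join()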