Spaces:

akhil2808
/

Qwen2_VL72B_OCR

Runtime error

App Files Files Community

John6666 committed on Oct 9

Commit

b1ef8f1

•

1 Parent(s): 4f963ef

Upload 2 files

Browse files

It starts up but does not feel like it works.

Files changed (2) hide show

app.py +68 -58
requirements.txt +7 -6

app.py CHANGED Viewed

@@ -1,58 +1,68 @@
-import gradio as gr
-import torch
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
-# Load the model and processor on available device(s)
-model = Qwen2VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen2-VL-72B-Instruct-AWQ",
-    torch_dtype=torch.float16,
-    #device_map="auto"
-)
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct-AWQ")
-@spaces.GPU(duration=60)
-def generate_caption(image, prompt):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,  # The uploaded image
-                },
-                {"type": "text", "text": prompt},
-            ],
-        }
-    ]
-    # Prepare the input
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt"
-    )
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    inputs = inputs.to(device)
-    # Generate the output
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    return output_text[0]
-# Launch the Gradio interface with the updated inference function and title
-demo = gr.ChatInterface(fn=generate_caption, title="Qwen2-VL-72B-Instruct-OCR", multimodal=True, description="Upload your Image and get the best possible insights out of the Image")
-demo.queue().launch()

+import spaces
+import gradio as gr
+import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+device = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_REPO = "Qwen/Qwen2-VL-72B-Instruct-AWQ"
+#MODEL_REPO = "Qwen/Qwen2-VL-7B-Instruct"
+# Load the model and processor; device_map and .to(device) are commented out below, so no explicit device placement happens here
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    MODEL_REPO,
+    torch_dtype=torch.float16,
+    #device_map="auto"
+)#.to(device)
+processor = AutoProcessor.from_pretrained(MODEL_REPO)
+@spaces.GPU(duration=60)
+def generate_caption(message, history, system_prompt, max_new_tokens):
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": message.get("text", "")}
+            ]
+        }
+    ]
+    for image in message["files"]:
+        messages["content"].append({"type": "image", "image": image})  # The uploaded image
+    # Prepare the input
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt"
+    )
+    inputs.to(device)
+    #model.to(device)
+    # Generate the output
+    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    return output_text[0]
+# Build the UI: the system-prompt box and token slider are passed to generate_caption as additional inputs of a multimodal ChatInterface, then launch with queuing
+with gr.Blocks() as demo:
+    system_prompt = gr.Textbox("You are helpful AI.", label="System Prompt", render=False)
+    tokens = gr.Slider(minimum=1, maximum=4096, value=128, step=1, label="Max new tokens", render=False)
+    gr.ChatInterface(fn=generate_caption, title="Qwen2-VL-72B-Instruct-OCR", multimodal=True,
+                    additional_inputs=[system_prompt, tokens],
+                    description="Upload your Image and get the best possible insights out of the Image")
+demo.queue().launch()

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
-huggingface_hub
-#torch==2.3.1
-torchvision==0.18.1
-accelerate
-qwen-vl-utils
-autoawq
 git+https://github.com/huggingface/transformers

+spaces>=0.30.3
+huggingface_hub
+torch
+torchvision
+accelerate
+qwen-vl-utils
+git+https://github.com/casper-hansen/AutoAWQ
 git+https://github.com/huggingface/transformers