Spaces: Running on Zero
Commit 04d3cd3 • Parent(s): 3614a58 • Update app.py
app.py CHANGED:

```diff
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
+from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration, TextIteratorStreamer
 from threading import Thread
 import re
 import time
@@ -8,17 +8,13 @@ import torch
 import cv2
 import spaces
 
-model_id = "llava-hf/llava-…"
+model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 
 processor = LlavaProcessor.from_pretrained(model_id)
 
 model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
 model.to("cuda")
 
-
-def replace_video_with_images(text, frames):
-    return text.replace("<video>", "<image>" * frames)
-
 def sample_frames(video_file, num_frames):
     video = cv2.VideoCapture(video_file)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
```
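The removed replace_video_with_images helper expanded the single <video> placeholder into one <image> token per sampled frame, as the older Interleave-style prompt required; the next hunk instead keeps a literal <video> token in the prompt, presumably because the Onevision processor handles the per-frame tokens itself. The dropped helper, reconstructed from the deleted lines:

```python
def replace_video_with_images(text, frames):
    # One <image> token per sampled frame replaces the single <video> tag.
    return text.replace("<video>", "<image>" * frames)

# replace_video_with_images("<video>\nWhat are these cats doing?", 3)
# -> "<image><image><image>\nWhat are these cats doing?"
```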
```diff
@@ -63,8 +59,7 @@ def bot_streaming(message, history):
     if image[0].endswith(video_extensions):
 
         image = sample_frames(image[0], 12)
-
-        prompt = f"<|im_start|>user {image_tokens}\n{message.text}<|im_end|><|im_start|>assistant"
+        prompt = f"<|im_start|>user <video>\n{message.text}<|im_end|><|im_start|>assistant"
     elif image[0].endswith(image_extensions):
         image = Image.open(image[0]).convert("RGB")
         prompt = f"<|im_start|>user <image>\n{message.text}<|im_end|><|im_start|>assistant"
@@ -109,7 +104,7 @@ def bot_streaming(message, history):
         yield generated_text_without_prompt
 
 
-demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Interleave", examples=[
+demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Onevision", examples=[
     {"text": "The input contains two videos, are the cats in this video and this video doing the same thing?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
     {"text": "There are two images in the input. What is the relationship between this image and this image?", "files":["./bee.jpg", "./depth-bee.png"]},
     {"text": "What are these cats doing?", "files":["./cats.mp4"]},
@@ -117,6 +112,6 @@ demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Interleave", examples=[
     {"text": "What is on the flower?", "files":["./bee.jpg"]},
     {"text": "How to make this pastry?", "files":["./baklava.png"]}],
     textbox=gr.MultimodalTextbox(file_count="multiple"),
-    description="Try [LLaVA …",
+    description="Try [LLaVA Onevision](https://huggingface.co/docs/transformers/main/en/model_doc/llava_onevision) in this demo (more specifically, the [Qwen-2-0.5B-Instruct variant](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
     stop_btn="Stop Generation", multimodal=True)
 demo.launch(debug=True)
```
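The diff shows only the opening lines of sample_frames. A plausible completion of the helper, assuming uniform sampling across the clip (the stride logic and the BGR-to-RGB conversion are assumptions, not part of this commit):

```python
from PIL import Image
import cv2

def sample_frames(video_file, num_frames):
    # Grab `num_frames` frames at a uniform stride across the whole clip.
    video = cv2.VideoCapture(video_file)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)
    frames = []
    for i in range(total_frames):
        ret, frame = video.read()
        if not ret:
            continue
        if i % interval == 0 and len(frames) < num_frames:
            # OpenCV decodes to BGR; PIL and the HF processors expect RGB.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames
```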
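Note that the commit switches the imports to the Onevision classes while the untouched context lines still instantiate LlavaProcessor and LlavaForConditionalGeneration. A minimal, self-consistent sketch of loading and querying the new checkpoint with the newly imported classes (the prompt format comes from the diff; the file name, generation length, and class pairing are assumptions about the intended end state, not the code as committed):

```python
import torch
from PIL import Image
from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# Load with the classes the commit imports; the Space's unchanged lines still
# call LlavaProcessor / LlavaForConditionalGeneration, so this pairing is an
# assumption, not the committed code.
processor = LlavaOnevisionProcessor.from_pretrained(model_id)
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16
).to("cuda")

# Single-image prompt in the Qwen-2 chat format used by the Space.
image = Image.open("./bee.jpg").convert("RGB")
prompt = "<|im_start|>user <image>\nWhat is on the flower?<|im_end|><|im_start|>assistant"

inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda", torch.float16)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```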