TuringsSolutions committed
Commit 40652ca · verified · 1 Parent(s): e09ac93

Update app.py

Files changed (1):
  app.py +88 -81

app.py CHANGED
@@ -1,94 +1,101 @@
 
 
  import gradio as gr
  import torch
- from llava.model.builder import load_pretrained_model
- from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
- from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
- from llava.conversation import conv_templates
- import copy
- from decord import VideoReader, cpu
- import numpy as np
-
- # Load the model
- pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
- model_name = "llava_qwen"
- device = "cuda" if torch.cuda.is_available() else "cpu"
- device_map = "auto"

  print("Loading model...")
- tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)
- model.eval()
  print("Model loaded successfully!")

- def load_video(video_path, max_frames_num, fps=1, force_sample=False):
-     if max_frames_num == 0:
-         return np.zeros((1, 336, 336, 3))
-     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
-     total_frame_num = len(vr)
-     video_time = total_frame_num / vr.get_avg_fps()
-     fps = round(vr.get_avg_fps() / fps)
-     frame_idx = [i for i in range(0, len(vr), fps)]
-     frame_time = [i / fps for i in frame_idx]
-     if len(frame_idx) > max_frames_num or force_sample:
-         sample_fps = max_frames_num
-         uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
-         frame_idx = uniform_sampled_frames.tolist()
-         frame_time = [i / vr.get_avg_fps() for i in frame_idx]
-     frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
-     spare_frames = vr.get_batch(frame_idx).asnumpy()
-     return spare_frames, frame_time, video_time
-
- def process_video(video_path, question):
-     max_frames_num = 64
-     video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
-     video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
-     video = [video]
-
-     conv_template = "qwen_1_5"
-     time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}. Please answer the following questions related to this video."
-
-     full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question}"
-
-     conv = copy.deepcopy(conv_templates[conv_template])
-     conv.append_message(conv.roles[0], full_question)
-     conv.append_message(conv.roles[1], None)
-     prompt_question = conv.get_prompt()
-
-     input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
-
-     with torch.no_grad():
-         output = model.generate(
-             input_ids,
-             images=video,
-             modalities=["video"],
-             do_sample=False,
-             temperature=0,
-             max_new_tokens=4096,
-         )
-
-     response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
-     return response
-
- def gradio_interface(video_file, question):
-     if video_file is None:
-         return "Please upload a video file."
-     response = process_video(video_file, question)
-     return response
-
- # Set up Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("# 🌋📹 LLaVA-Video Chatbot")
-     with gr.Row():
-         with gr.Column():
-             video_input = gr.Video()
-             question_input = gr.Textbox(label="User Question", placeholder="Ask a question about the video...")
-             submit_button = gr.Button("Ask LLaVA-Video")
-             output = gr.Textbox(label="LLaVA-Video Response")
-
-     submit_button.click(
-         fn=gradio_interface,
-         inputs=[video_input, question_input],
-         outputs=output
      )

- if __name__ == "__main__":
-     demo.launch(show_error=True)
 
 
+ import time
+ from threading import Thread
  import gradio as gr
  import torch
+ from PIL import Image
+ from transformers import AutoProcessor, LlavaForConditionalGeneration, TextIteratorStreamer

+ # Model Configuration
+ model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"

  print("Loading model...")
+ processor = AutoProcessor.from_pretrained(model_id)
+ model = LlavaForConditionalGeneration.from_pretrained(
+     model_id,
+     torch_dtype=torch.float16,
+     low_cpu_mem_usage=True
+ )
+ model.to("cuda" if torch.cuda.is_available() else "cpu")
+ model.generation_config.eos_token_id = 128009
  print("Model loaded successfully!")

+ PLACEHOLDER = """
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+     <img src="https://cdn-uploads.huggingface.co/production/uploads/64ccdc322e592905f922a06e/DDIW0kbWmdOQWwy4XMhwX.png"
+          style="width: 80%; max-width: 550px; height: auto; opacity: 0.55;">
+     <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">LLaVA-Llama-3-8B</h1>
+     <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">
+         Llava-Llama-3-8B is fine-tuned from Meta-Llama-3-8B-Instruct and CLIP-ViT-Large-patch14-336
+         using ShareGPT4V-PT and InternVL-SFT by XTuner.
+     </p>
+ </div>
+ """

+ def bot_streaming(message, history):
+     """Handles message processing with image and text streaming."""
+     try:
+         image = None
+
+         # Extract image from message or history
+         if message["files"]:
+             image = message["files"][-1]["path"] if isinstance(message["files"][-1], dict) else message["files"][-1]
+         else:
+             for hist in history:
+                 if isinstance(hist[0], tuple):
+                     image = hist[0][0]
+
+         if not image:
+             # Yield rather than return: a value returned from a generator is discarded.
+             yield "Error: Please upload an image for LLaVA to work."
+             return
+
+         # Prepare inputs in the Llama-3 chat format; the trailing assistant
+         # header gives the model a turn to complete.
+         image = Image.open(image)
+         prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+         inputs = processor(prompt, image, return_tensors="pt").to(device=model.device, dtype=torch.float16)
+
+         # Stream text generation
+         streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False)
+         thread = Thread(target=model.generate, kwargs=generation_kwargs)
+         thread.start()
+
+         buffer = ""
+         time.sleep(0.5)  # Allow some time for initial generation
+
+         # Stream the generated response
+         for new_text in streamer:
+             if "<|eot_id|>" in new_text:
+                 new_text = new_text.split("<|eot_id|>")[0]
+             buffer += new_text
+             yield buffer
+
+     except Exception as e:
+         yield f"Error: {str(e)}"
+
+ # Define Gradio interface components
+ chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
+ chat_input = gr.MultimodalTextbox(
+     interactive=True, file_types=["image"], placeholder="Enter message or upload a file...", show_label=False
+ )
+
+ with gr.Blocks(fill_height=True) as demo:
+     gr.ChatInterface(
+         fn=bot_streaming,
+         title="LLaVA Llama-3-8B",
+         examples=[
+             {"text": "What is on the flower?", "files": ["./bee.jpg"]},
+             {"text": "How to make this pastry?", "files": ["./baklava.png"]}
+         ],
+         description=(
+             "Try [LLaVA Llama-3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers). "
+             "Upload an image and start chatting about it, or simply try one of the examples below. "
+             "If you don't upload an image, you will receive an error."
+         ),
+         stop_btn="Stop Generation",
+         multimodal=True,
+         textbox=chat_input,
+         chatbot=chatbot,
      )

+ # Launch the Gradio app
+ demo.queue(api_open=False)
+ demo.launch(show_api=False, share=False)
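
A note on what bot_streaming receives: with multimodal=True, Gradio passes each user turn to fn as a dict carrying the typed text and any uploaded files. The value below is hypothetical, just to show the shape; the dict-vs-string check on files in bot_streaming exists because Gradio releases have represented uploads both as plain paths and as {"path": ...} dicts.

# Hypothetical `message` argument when the user uploads bee.jpg and asks a question:
message = {
    "text": "What is on the flower?",
    "files": ["./bee.jpg"],  # some Gradio versions deliver [{"path": "./bee.jpg", ...}] instead
}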
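The prompt string follows the Llama-3 instruct chat layout: each turn is framed by header tokens and closed with <|eot_id|> (token id 128009, which is why the code pins generation_config.eos_token_id to it), and generation begins right after an assistant header so the model has a turn to complete. A minimal sketch with a hypothetical question, split across lines for readability:

prompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n"        # user turn header
    "<image>\nWhat is on the flower?<|eot_id|>"           # <image> is the placeholder the model fills with projected image features
    "<|start_header_id|>assistant<|end_header_id|>\n\n"   # the model completes from here
)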
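The streaming side uses the standard TextIteratorStreamer pattern: model.generate() blocks until generation finishes, so it runs on a worker thread while the caller iterates the streamer, which yields decoded text as tokens are produced. A minimal text-only sketch of the same pattern; gpt2 stands in for the Llava model here purely to keep the example small.

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so run it on a worker thread and consume
# decoded chunks from the streamer on the main thread.
thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=40))
thread.start()

buffer = ""
for chunk in streamer:
    buffer += chunk  # each chunk is newly decoded text
    print(buffer)
thread.join()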