moondream2-batch-processing

Runtime error

File size: 2,903 Bytes

6a8ca1f
 
 
 
 
04fc1f1
6a8ca1f
 
 
 
 
 
 
 
 
 
e9cc0b5
 
6a8ca1f
 
de50a7e
04fc1f1
ee5e19e
134e8f7
db2ea29
ee5e19e
db2ea29
3f71d24
db2ea29
 
 
 
 
 
 
 
471f9af
3f71d24
f2ab852
3f71d24
 
 
f2ab852
3f71d24
 
 
134e8f7
6a8ca1f
 
ee5e19e
471f9af
 
 
fefde70
6a8ca1f
 
69cfbe8
e9ecb71
69cfbe8
6a8ca1f
f2ab852
471f9af
3f71d24
6a8ca1f

import spaces
import torch
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Use half precision on a CUDA device when one is available; otherwise run in
# full precision on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# The checkpoint is pinned to a fixed revision so the tokenizer and the model
# are always loaded from the same snapshot of the repository.
model_id = "vikhyatk/moondream2"
revision = "2024-04-02"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    torch_dtype=dtype,
)
moondream = moondream.to(device=device)
# Inference only: switch the module to evaluation mode.
moondream.eval()

@spaces.GPU
def answer_questions(image_tuples, prompt_text):
    """Ask every prompt about every uploaded image and format the answers.

    Args:
        image_tuples: Sequence of items whose first element is an image
            exposing ``convert("RGB")`` (or ``None``). Entries with a ``None``
            image are skipped. ``None`` or an empty sequence is accepted.
        prompt_text: Prompts separated by commas. Surrounding whitespace is
            stripped and empty prompts are dropped.

    Returns:
        A ``(markdown, table)`` tuple:

        * ``markdown`` lists, for each prompt, the answer for every usable
          image. Images are numbered ``image1``, ``image2``, ... in the order
          of the usable (non-``None``) images.
        * ``table`` is ``{"headers": prompts, "data": rows}`` where each row
          holds one image's answers, in the same order as ``headers``.

        When there is no usable image or no non-empty prompt, a short hint is
        returned as the markdown together with an empty table.
    """
    # Split and clean the prompts; ignore empty ones (e.g. a trailing comma).
    prompts = [p.strip() for p in (prompt_text or "").split(",")]
    prompts = [p for p in prompts if p]

    # Convert once up front instead of once per prompt. Filtering happens here
    # so that every later index refers to the same list of images.
    images = [
        item[0].convert("RGB")
        for item in (image_tuples or [])
        if item[0] is not None
    ]

    print(f"\nprompts: {prompts}\n\n")

    if not prompts or not images:
        # Nothing to send to the model; avoid calling it with an empty batch.
        hint = "Please upload at least one image and enter at least one prompt."
        return hint, {"headers": prompts, "data": []}

    # One batched call per prompt: answers_by_prompt[i][j] is the answer to
    # prompt i for image j.
    answers_by_prompt = [
        moondream.batch_answer(
            images=images,
            prompts=[prompt] * len(images),
            tokenizer=tokenizer,
        )
        for prompt in prompts
    ]

    sections = []
    for prompt, image_answers in zip(prompts, answers_by_prompt):
        sections.append(f"###Q: {prompt}\n")
        for number, answer_text in enumerate(image_answers, start=1):
            sections.append(f"**image{number} A:**\n{answer_text}\n\n")
    q_and_a = "".join(sections)

    # The headers are the prompts, so each column must hold one prompt's
    # answers: transpose the prompt-major answers into one row per image.
    rows = [list(per_image) for per_image in zip(*answers_by_prompt)]
    result = {'headers': prompts, 'data': rows}
    print(f"result\n{result}\n\nQ_and_A\n{q_and_a}\n\n")
    return q_and_a, result

# UI: an image gallery and a comma-separated prompt box feed answer_questions;
# its two return values fill the markdown view and the dataframe below.
with gr.Blocks() as demo:
    gr.Markdown("# moondream2 unofficial batch processing demo")
    gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")
    gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
    gr.Markdown("*Running on free CPU space tier currently so results may take a bit to process compared to duplicating space and using GPU space hardware*")
    gr.Markdown("## 🌔 moondream2\nA tiny vision language model. [GitHub](https://github.com/vikhyatk/moondream)")
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil")
    with gr.Row():
        # The placeholder describes the actual behaviour: every prompt is
        # asked about every image, not one prompt per image.
        prompt = gr.Textbox(label="Input Prompts", placeholder="Enter prompts separated by commas (each prompt is asked about every image). Ex: Describe this image, What is in this image?", lines=8)
    with gr.Row():
        submit = gr.Button("Submit")
    output = gr.Markdown(label="Questions and Answers")
    output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    submit.click(answer_questions, [img, prompt], [output, output2])

# Enable request queuing, then start the app.
demo.queue().launch()