# explain-image/app.py
import os

import gradio as gr
import torch
from PIL import Image
import spaces
from transformers import MllamaForConditionalGeneration, AutoProcessor
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using a token stored as a Space secret
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
login(huggingface_token)
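# Note: meta-llama checkpoints are gated on the Hub, so this token must belong
# to an account that has been granted access to Llama 3.2.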

# Load the Llama 3.2 Vision model and its processor
def load_llama_model():
    model_id = "meta-llama/Llama-3.2-11B-Vision"
    # Load the model in bfloat16, placing weights across available devices
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
    )
    model.tie_weights()
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor
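
# Note (not in the original code): load_llama_model() is called on every
# request, so each inference pays the full model-loading cost. One possible
# tweak is to cache the loaded model, e.g.:
#
#     from functools import lru_cache
#
#     @lru_cache(maxsize=1)
#     def load_llama_model():
#         ...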

# Generate a prediction from text, optionally conditioned on an uploaded image
@spaces.GPU
def process_input(text, image=None):
    model, processor = load_llama_model()
    if image is not None:
        # Gradio delivers the upload as a PIL Image; normalize it for the model
        vision_input = image.convert("RGB").resize((224, 224))
        prompt = f"<|image|><|begin_of_text|>{text}"
        # Process image and text together
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # Text-only prompt; the processor expects the string via its `text` argument
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)
    # Generate a short completion and decode it into readable text
    outputs = model.generate(**inputs, max_new_tokens=50)
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
    return decoded_output
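
# Hypothetical local smoke test (the file path is an assumption, not part of the app):
#     from PIL import Image
#     print(process_input("Extract text from this image", Image.open("./examples/text-image-1.jpg")))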

# Build the Gradio interface
def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=3)

    # Example prompt/image pair for multimodal analysis
    # (the image is expected under ./examples/ in the Space repo)
    examples = [
        ["Extract text from this image", "./examples/text-image-1.jpg"],
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description=(
            "Upload an image and/or provide text for analysis using the "
            "Llama 3.2 Vision model. You can also try out the provided examples."
        ),
    )
    # Launch the demo
    interface.launch()

# Run the demo
if __name__ == "__main__":
    demo()
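
# To run this Space locally (assumed dependencies, not pinned in the original repo):
#     pip install gradio torch transformers accelerate huggingface_hub spaces
#     python app.py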