# explain-image/app.py
import os

import gradio as gr
import torch
from PIL import Image
import spaces
from transformers import MllamaForConditionalGeneration, AutoProcessor
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using a token stored as a Space secret
huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
login(huggingface_token)
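# Note: meta-llama checkpoints are gated on the Hub, so this token must belong
# to an account that has been granted access to Llama 3.2.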

# Load the Llama 3.2 Vision model and its processor
def load_llama_model():
    model_id = "meta-llama/Llama-3.2-11B-Vision"
    # Load the model in bfloat16, placing weights across available devices
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
    )
    model.tie_weights()
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor
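
# Note (not in the original code): load_llama_model() is called on every
# request, so each inference pays the full model-loading cost. One possible
# tweak is to cache the loaded model, e.g.:
#
#     from functools import lru_cache
#
#     @lru_cache(maxsize=1)
#     def load_llama_model():
#         ...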

# Generate a prediction from text, optionally conditioned on an uploaded image
@spaces.GPU
def process_input(text, image=None):
    model, processor = load_llama_model()
    if image is not None:
        # Gradio delivers the upload as a PIL Image; normalize it for the model
        vision_input = image.convert("RGB").resize((224, 224))
        prompt = f"<|image|><|begin_of_text|>{text}"
        # Process image and text together
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt").to(model.device)
    else:
        # Text-only prompt; the processor expects the string via its `text` argument
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt").to(model.device)
    # Generate a short completion and decode it into readable text
    outputs = model.generate(**inputs, max_new_tokens=50)
    decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
    return decoded_output
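
# Hypothetical local smoke test (the file path is an assumption, not part of the app):
#     from PIL import Image
#     print(process_input("Extract text from this image", Image.open("./examples/text-image-1.jpg")))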

# Build the Gradio interface
def demo():
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=3)

    # Example prompt/image pair for multimodal analysis
    # (the image is expected under ./examples/ in the Space repo)
    examples = [
        ["Extract text from this image", "./examples/text-image-1.jpg"],
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        examples=examples,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description=(
            "Upload an image and/or provide text for analysis using the "
            "Llama 3.2 Vision model. You can also try out the provided examples."
        ),
    )
    # Launch the demo
    interface.launch()

# Run the demo
if __name__ == "__main__":
    demo()
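
# To run this Space locally (assumed dependencies, not pinned in the original repo):
#     pip install gradio torch transformers accelerate huggingface_hub spaces
#     python app.py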