import functools
import os
from io import BytesIO

import gradio as gr
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers.image_utils import load_image
import torch
import spaces  # Provides the @spaces.GPU decorator used below
import requests
from PIL import Image

# Checkpoint loaded by load_model().
MODEL_ID = "google/paligemma2-10b-pt-448"

# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 200


@functools.lru_cache(maxsize=1)
def load_model():
    """Load the PaliGemma2 processor and model using a Hugging Face token.

    The token is read from the ``HUGGINGFACEHUB_API_TOKEN`` environment
    variable. The result is memoized, so repeated calls within the same
    process reuse the already-loaded objects instead of reloading the
    checkpoint. A call that raises is not cached and will be retried.

    Returns:
        A ``(processor, model)`` tuple. The model is loaded in bfloat16,
        moved to CUDA when ``torch.cuda.is_available()`` (CPU otherwise),
        and put in eval mode.

    Raises:
        ValueError: If the environment variable is unset or empty.
    """
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )

    # `token=` replaces the deprecated `use_auth_token=` keyword.
    processor = PaliGemmaProcessor.from_pretrained(MODEL_ID, token=token)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = (
        PaliGemmaForConditionalGeneration.from_pretrained(
            MODEL_ID, torch_dtype=torch.bfloat16, token=token
        )
        .to(device)
        .eval()
    )
    return processor, model


@spaces.GPU(duration=120)  # Increased timeout to 120 seconds
def process_image_and_text(image_pil, num_beams, temperature, seed):
    """Generate text for an uploaded image with PaliGemma2.

    Args:
        image_pil: The image supplied by the Gradio image input.
        num_beams: Number of beams for generation; coerced to ``int``.
        temperature: Sampling temperature passed to ``generate``.
        seed: Seed for ``torch.manual_seed``; coerced to ``int``, and
            treated as 0 when the field is empty (``None``).

    Returns:
        The decoded generated text, with the prompt tokens and special
        tokens removed.

    Raises:
        gr.Error: If no image was provided, or if any step of loading,
            preprocessing, or generation fails.
    """
    # Fail fast with a clear message instead of an opaque downstream error.
    if image_pil is None:
        raise gr.Error("Please upload an image before submitting.")

    try:
        processor, model = load_model()
        # Use the model's own device so inputs always match the cached model.
        device = model.device

        image = load_image(image_pil)

        # The text prompt is a single space.
        # NOTE(review): the original comment mentioned adding a token to the
        # beginning of the prompt, but the literal contains none - confirm
        # whether an image token or task prefix was intended here.
        text_input = " "
        model_inputs = processor(
            text=text_input, images=image, return_tensors="pt"
        ).to(device, dtype=torch.bfloat16)
        input_len = model_inputs["input_ids"].shape[-1]

        # UI widgets may deliver floats (or None for a cleared number field).
        torch.manual_seed(0 if seed is None else int(seed))

        with torch.inference_mode():
            generation = model.generate(
                **model_inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=True,
                num_beams=int(num_beams),
                temperature=temperature,
            )

        # Drop the prompt tokens so only newly generated tokens are decoded.
        generation = generation[0][input_len:]
        return processor.decode(generation, skip_special_tokens=True)
    except Exception as e:
        print(f"Error during GPU task: {e}")
        raise gr.Error(f"GPU task failed: {e}") from e


def build_interface():
    """Build the Gradio interface wired to ``process_image_and_text``."""
    return gr.Interface(
        fn=process_image_and_text,
        inputs=[
            gr.Image(type="pil", label="Upload an image"),
            gr.Slider(minimum=1, maximum=10, step=1, value=10, label="Number of Beams"),
            gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
            gr.Number(label="Random Seed", value=0, precision=0),
        ],
        outputs=gr.Textbox(label="Generated Text"),
        title="PaliGemma2 Image to Text",
        description="Upload an image and the model will generate text.",
    )


if __name__ == "__main__":
    iface = build_interface()
    iface.launch()