|
import streamlit as st |
|
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration |
|
from PIL import Image |
|
import torch |
|
import os |
|
|
|
@st.cache_resource(show_spinner="Loading PaliGemma2 model…")
def load_model():
    """Load the PaliGemma2 processor and model, authenticated via HF token.

    Cached with ``st.cache_resource`` so the (large) model is instantiated
    once per Streamlit server process instead of on every script rerun.

    Returns:
        tuple: ``(processor, model)`` ready for inference.

    Raises:
        ValueError: if ``HUGGINGFACEHUB_API_TOKEN`` is not set in the
            environment.
    """
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not token:
        raise ValueError("Hugging Face API token not found. Please set it in the environment variables.")
    # NOTE(review): published PaliGemma 2 checkpoints usually carry a
    # size/resolution suffix (e.g. "google/paligemma2-3b-pt-224") — confirm
    # this bare repo id actually exists on the Hub.
    model_id = "google/paligemma2"
    processor = PaliGemmaProcessor.from_pretrained(model_id, token=token)
    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, token=token)
    # Inference-only usage: disable dropout etc. deterministically.
    model.eval()
    return processor, model
|
|
|
def process_image(image, processor, model, prompt="ocr"):
    """Extract text from an image using PaliGemma2.

    Args:
        image: PIL image (or anything the processor accepts) to read text from.
        processor: ``PaliGemmaProcessor`` returned by ``load_model``.
        model: ``PaliGemmaForConditionalGeneration`` returned by ``load_model``.
        prompt: Task prompt conditioning the model. PaliGemma is a
            prompt-driven model; ``"ocr"`` is its text-extraction task.

    Returns:
        str: The decoded text produced for the image.
    """
    # PaliGemma requires a text prompt alongside the image — the original
    # call with images only is not a valid input for this processor.
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    input_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        # Default max_new_tokens (20) would truncate extracted text;
        # greedy decoding keeps the output deterministic.
        generated_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

    # Decode only the newly generated tokens, skipping the echoed prompt.
    text = processor.batch_decode(generated_ids[:, input_len:], skip_special_tokens=True)[0]
    return text.strip()