import requests
from typing import Dict, Any
from PIL import Image
import torch
import base64
from io import BytesIO
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
from dotenv import load_dotenv
import os
load_dotenv()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the BLIP VQA model and its processor once at startup.
        self.processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        self.model = AutoModelForVisualQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-capfilt-large"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        # Optional shared-secret check: reject requests whose token does not match.
        if os.getenv("SECRET_TOKEN") and os.getenv("SECRET_TOKEN") != data.get("secret_token"):
            return {"answers": [], "error": "Invalid secret token"}

        input_data = data.get("inputs", {})
        input_image = input_data.get("image")
        if not input_image:
            return {"answers": [], "error": "No image provided"}

        questions = input_data.get("questions")
        if not questions:
            return {"answers": [], "error": "No questions provided"}

        try:
            # Accept the single image either as a base64 string or as a URL.
            if input_image.get("base64"):
                raw_image = Image.open(BytesIO(base64.b64decode(input_image.get("base64")))).convert("RGB")
            elif input_image.get("url"):
                raw_image = Image.open(BytesIO(requests.get(input_image.get("url")).content)).convert("RGB")
            else:
                return {"answers": [], "error": "Invalid image input"}

            answers = []
            for question in questions:
                # Process the image and question
                processed_input = self.processor(raw_image, question, return_tensors="pt").to(device)
                # Generate the answer
                out = self.model.generate(**processed_input)
                # Decode the answer
                answer = self.processor.batch_decode(out, skip_special_tokens=True)[0]
                # Add the answer to the list for the current image
                answers.append(answer)
            return {"answers": answers}
        except Exception as e:
            print(f"Error during processing: {str(e)}")
            return {"answers": [], "error": str(e)}