import base64
import os
from io import BytesIO
from typing import Any, Dict

import requests
import torch
from dotenv import load_dotenv
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

# Load environment variables (e.g. SECRET_TOKEN) from a local .env file.
load_dotenv()

# Run on GPU when available; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the BLIP VQA processor and model once at startup and keep the model in eval mode.
        self.processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
        self.model = AutoModelForVisualQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-capfilt-large"
        ).to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Answer one or more questions about a single image.

        Expected payload:
            {
                "secret_token": "...",  # required only when SECRET_TOKEN is set
                "inputs": {
                    "image": {"base64": "..."} or {"url": "https://..."},
                    "questions": ["What animal is this?", ...],
                },
            }
        """
        # Reject the request when a SECRET_TOKEN is configured and the caller's token does not match.
        if os.getenv("SECRET_TOKEN") and os.getenv("SECRET_TOKEN") != data.get("secret_token"):
            return {"answers": [], "error": "Invalid secret token"}

        input_data = data.get("inputs", {})
        input_image = input_data.get("image")
        if not input_image:
            return {"answers": [], "error": "No image provided"}

        questions = input_data.get("questions")
        if not questions:
            return {"answers": [], "error": "No questions provided"}

        try:
            # Decode an inline base64 image, or fetch the image over HTTP.
            if input_image.get("base64"):
                raw_image = Image.open(BytesIO(base64.b64decode(input_image.get("base64")))).convert("RGB")
            elif input_image.get("url"):
                response = requests.get(input_image.get("url"), timeout=30)
                response.raise_for_status()
                raw_image = Image.open(BytesIO(response.content)).convert("RGB")
            else:
                return {"answers": [], "error": "Invalid image input"}

            answers = []
            for question in questions:
                # Tokenize the image/question pair, generate an answer, and decode it to text.
                processed_input = self.processor(raw_image, question, return_tensors="pt").to(device)
                with torch.no_grad():
                    out = self.model.generate(**processed_input)
                answer = self.processor.batch_decode(out, skip_special_tokens=True)[0]
                answers.append(answer)

            return {"answers": answers}

        except Exception as e:
            print(f"Error during processing: {e}")
            return {"answers": [], "error": str(e)}
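

# Minimal local smoke-test sketch, assuming this file is run directly rather than
# inside an inference endpoint. The image URL is a placeholder; substitute any
# reachable image, or use the base64 path shown in the comments below.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "secret_token": os.getenv("SECRET_TOKEN"),
        "inputs": {
            "image": {"url": "https://example.com/cat.jpg"},  # placeholder URL
            "questions": ["What animal is in the picture?", "What color is it?"],
        },
    }
    # To exercise the base64 path instead (sketch):
    #   with open("cat.jpg", "rb") as f:
    #       payload["inputs"]["image"] = {"base64": base64.b64encode(f.read()).decode()}
    print(handler(payload))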