Spaces:

usernameisanna
/

BLIP2-DeiT-VQA

Sleeping

App Files Files Community

BLIP2-DeiT-VQA / app.py

usernameisanna

Update app.py

d8ca139 verified 4 months ago

raw

history blame

No virus

2.82 kB

	import gradio as gr
	import torch
	import torch.nn.functional as F
	from transformers import Blip2Processor, Blip2ForConditionalGeneration
	from PIL import Image
	from peft import LoraConfig, get_peft_model


	# Pick the device first so the checkpoint can be mapped onto it at load time.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# The processor (image preprocessing + tokenizer) comes from the stock
	# BLIP-2 checkpoint on the Hub.
	processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

	# torch.load unpickles arbitrary Python objects, so this must only ever be
	# pointed at a trusted checkpoint file.
	# map_location keeps a checkpoint that was saved on a GPU loadable on a
	# CPU-only host (without it torch.load tries to restore CUDA storages).
	# NOTE(review): recent PyTorch releases default torch.load to
	# weights_only=True, which rejects pickled full-model objects; pass
	# weights_only=False if the deployed torch version requires it -- confirm.
	# Other checkpoints tried previously: full-blip2-deit-config-yes-no-2.pth,
	# full-blip2-deit-config-2.pth, full-blip2-deit.pth (noted as not working).
	model = torch.load(
	    "./full-blip2-deit-config-free-form-4-ver-2.pth",
	    map_location=device,
	)

	model.to(device)
	model.eval()  # Set the model to evaluation mode

	def preprocess_image(image):
	    """Encode an image with the model's vision tower and repack it as a 1x3x224x224 tensor.

	    Steps, in order:
	      1. The module-level ``processor`` turns ``image`` into ``pixel_values``.
	      2. The vision model's patch projection is applied and the result is
	         viewed as ``(1, -1, 1408)``.
	      3. The vision model's CLS and distillation tokens are prepended.
	      4. The sequence is run through the vision encoder.
	      5. The encoder's last hidden state is adaptively average-pooled to
	         ``(3, 50176)`` and viewed as ``(1, 3, 224, 224)`` (50176 == 224 * 224).

	    Args:
	        image: A single image in any form accepted by ``processor(images=...)``.

	    Returns:
	        A tensor of shape ``(1, 3, 224, 224)`` on ``device``.
	    """
	    # Inference only: skip autograd bookkeeping so no graph is built and
	    # activations are not kept alive for a backward pass that never runs.
	    with torch.no_grad():
	        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

	        embeddings = model.vision_model.embeddings

	        # NOTE(review): the projection output is presumably (1, 1408, H, W);
	        # .view reinterprets that buffer rather than transposing it.
	        # Confirm this matches how the checkpoint was trained before changing it.
	        patch_embeddings = embeddings.patch_embeddings.projection(pixel_values)
	        patch_embeddings_flat = patch_embeddings.view(1, -1, 1408)

	        # Batch size is fixed at 1 throughout this function.
	        cls_token = embeddings.cls_token.expand(1, -1, -1)
	        dist_token = embeddings.distillation_token.expand(1, -1, -1)
	        full_embeddings = torch.cat([cls_token, dist_token, patch_embeddings_flat], dim=1)

	        encoder_outputs = model.vision_model.encoder(full_embeddings)
	        image_outputs = encoder_outputs.last_hidden_state

	        # Pool the encoder output down to 3 x (224 * 224) values, then
	        # reshape them into an image-like tensor.
	        image_outputs = F.adaptive_avg_pool2d(image_outputs, (3, 50176))
	        image_outputs = image_outputs.view(1, 3, 224, 224)

	    return image_outputs

	def generate_answer_blip2(image, question):
	    """Generate an answer to a question about an image using the BLIP2 model.

	    Args:
	        image: The image to ask about, as passed in by the Gradio image input.
	        question: The question text; it is wrapped as
	            ``"Question: <question> Answer:"`` before tokenization.

	    Returns:
	        The first decoded answer string produced by ``model.generate``.

	    Raises:
	        ValueError: If ``image`` is ``None`` (nothing was uploaded).
	    """
	    # Fail early with a clear message instead of an opaque error from deep
	    # inside the processor when the user submits without an image.
	    if image is None:
	        raise ValueError("No image was provided; please upload an image.")

	    image_outputs = preprocess_image(image)

	    # Prepare question
	    question_formatted = "Question: " + question + " Answer:"
	    encoded = processor(text=question_formatted, return_tensors="pt")

	    # Every tensor handed to generate() must live on the model's device;
	    # previously only pixel_values was moved, which breaks on CUDA.
	    inputs = {key: value.to(device) for key, value in encoded.items()}
	    inputs['pixel_values'] = image_outputs.to(device)

	    # Generate response using the model
	    generated_ids = model.generate(**inputs, max_length=50)
	    generated_answer = processor.batch_decode(generated_ids, skip_special_tokens=True)

	    return generated_answer[0]  # Return the first (and typically only) generated answer

	# Gradio UI: one image input and one question box feed
	# generate_answer_blip2, and the returned string is shown in a text box.
	_image_input = gr.Image(label="Upload Image")
	_question_input = gr.Textbox(label="Enter your question")
	_answer_output = gr.Textbox(label="Generated Answer")

	iface = gr.Interface(
	    fn=generate_answer_blip2,
	    inputs=[_image_input, _question_input],
	    outputs=_answer_output,
	    title="Visual Question Answering with DeiT-BLIP2 Model",
	    description="Upload an image and type a related question to receive an answer generated by the model.",
	)

	# Only start the web server when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()