"""Gradio demo for visual question answering with a customized BLIP-2 model.

The script loads a BLIP-2 processor and a locally saved, fully pickled model,
then exposes a small Gradio Blocks UI: the user uploads an image, types a
question, and receives the generated answer.
"""

import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
from peft import LoraConfig, get_peft_model

# Width used when flattening the patch-projection output in preprocess_image.
VISION_HIDDEN_DIM = 1408
# Side length (pixels) and channel count of the tensor handed to generate().
IMAGE_SIZE = 224
IMAGE_CHANNELS = 3
# Upper bound passed to model.generate(max_length=...).
MAX_ANSWER_LENGTH = 50

# Initialize the processor and model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Checkpoints tried previously (kept for reference):
# model_path = "full-blip2-deit-config-yes-no-2.pth"
# model = torch.load("./full-blip2-deit-config-2.pth")
# model = torch.load("./full-blip2-deit.pth")  # not working - error
# model = torch.load("./full-blip2-deit-config-free-form-4-ver-2.pth")

# Pick the device first so the checkpoint can be mapped onto it while loading;
# without map_location a checkpoint saved on GPU cannot be loaded on a
# CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SECURITY NOTE: torch.load unpickles arbitrary Python objects. Only load
# checkpoints from a trusted source.
# NOTE(review): PyTorch >= 2.6 defaults to weights_only=True, which rejects
# fully pickled models; pass weights_only=False there if loading fails.
model = torch.load("./full_config_blip2-deit-05", map_location=device)
model.eval()  # Set the model to evaluation mode
model.to(device)


@torch.no_grad()
def preprocess_image(image):
    """Turn an input image into the pixel tensor passed to ``model.generate``.

    The image is run through the processor, the vision model's patch
    projection, and the vision encoder (with the class and distillation tokens
    prepended). The encoder output is then average-pooled and reshaped to
    ``(1, 3, 224, 224)``.

    Args:
        image: Image accepted by ``Blip2Processor`` (the value delivered by
            the ``gr.Image`` component).

    Returns:
        A tensor of shape ``(1, 3, 224, 224)`` on ``device``.
    """
    # Convert the image to a normalized tensor on the model's device.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    embeddings = model.vision_model.embeddings

    # Project the pixels into patch embeddings and flatten them to a sequence.
    # NOTE(review): view() reinterprets memory without transposing the channel
    # axis; the usual idiom is .flatten(2).transpose(1, 2). Left unchanged
    # because the checkpoint may have been trained with this exact layout -
    # confirm against the training code.
    patch_embeddings = embeddings.patch_embeddings.projection(pixel_values)
    patch_embeddings_flat = patch_embeddings.view(1, -1, VISION_HIDDEN_DIM)

    # Prepend the class and distillation tokens (batch size is fixed at 1).
    cls_token = embeddings.cls_token.expand(1, -1, -1)
    dist_token = embeddings.distillation_token.expand(1, -1, -1)
    full_embeddings = torch.cat([cls_token, dist_token, patch_embeddings_flat], dim=1)

    encoder_outputs = model.vision_model.encoder(full_embeddings)
    image_outputs = encoder_outputs.last_hidden_state

    # Pool the encoder output down to 3 * 224 * 224 values, then view it as an
    # image-shaped tensor.
    image_outputs = F.adaptive_avg_pool2d(
        image_outputs, (IMAGE_CHANNELS, IMAGE_SIZE * IMAGE_SIZE)
    )
    image_outputs = image_outputs.view(1, IMAGE_CHANNELS, IMAGE_SIZE, IMAGE_SIZE)
    return image_outputs


@torch.no_grad()
def generate_answer_blip2(image, question):
    """Generate an answer for a question about an image.

    Args:
        image: Image from the UI, or ``None`` when nothing has been uploaded.
        question: The user's question as plain text.

    Returns:
        The first decoded sequence produced by the model, or a short prompt
        asking for an image when ``image`` is ``None``.
    """
    if image is None:
        # Nothing to run the model on; tell the user instead of crashing
        # inside the processor.
        return "Please upload an image before submitting a question."

    image_outputs = preprocess_image(image)

    # Prepare question
    question_formatted = "Question: " + question + " Answer:"
    # Move the token tensors to the model's device; otherwise generate() fails
    # with a CPU/GPU mismatch when running on CUDA.
    inputs = processor(text=question_formatted, return_tensors="pt").to(device)
    inputs["pixel_values"] = image_outputs.to(device)

    # Generate response using the model
    generated_ids = model.generate(**inputs, max_length=MAX_ANSWER_LENGTH)
    generated_answer = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_answer[0]  # Return the first (and typically only) answer


def show_demo():
    """Return visibility updates that reveal the five hidden demo components."""
    return (
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )


def generate_and_show_answer(image, question):
    """Submit-button callback: delegate to ``generate_answer_blip2``."""
    return generate_answer_blip2(image, question)


def clear_inputs():
    """Clear-button callback: reset the image, the question and the answer."""
    return None, "", ""


# Setting up the Gradio interface with Blocks
with gr.Blocks() as landing_page:
    gr.Markdown("# Welcome to the Visual Question Answering Demo")
    gr.Markdown("This demo uses the customized BLIP2 model to answer questions about images.")
    gr.Markdown("### How to Use: ")
    gr.Markdown("1. Upload an image. \n2. Enter a question related to the image. \n3. Receive the generated answer.")
    gr.Markdown("### Model Information: ")
    gr.Markdown("The BLIP2 model combines vision and language understanding to generate answers based on the provided image and question.")

    with gr.Column() as demo_column:
        start_demo_button = gr.Button("Start Demo")
        # The demo widgets stay hidden until "Start Demo" is clicked.
        image_input = gr.Image(label="Upload Image", visible=False)
        question_input = gr.Textbox(label="Enter your question", visible=False)
        submit_button = gr.Button("Submit", visible=False)
        clear_button = gr.Button("Clear", visible=False)
        answer_output = gr.Textbox(label="Generated Answer", visible=False)

    start_demo_button.click(
        fn=show_demo,
        inputs=None,
        outputs=[image_input, question_input, submit_button, clear_button, answer_output],
    )
    submit_button.click(
        fn=generate_and_show_answer,
        inputs=[image_input, question_input],
        outputs=answer_output,
    )
    # One return value per distinct output component.
    clear_button.click(
        fn=clear_inputs,
        inputs=None,
        outputs=[image_input, question_input, answer_output],
    )

if __name__ == "__main__":
    landing_page.launch()