Spaces:

saja003
/

VQA-with-Audio

Sleeping

App Files Files Community

saja003 committed on Sep 26, 2024

Commit

314df46

verified ·

1 Parent(s): 38a16e5

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -0

app.py CHANGED Viewed

@@ -8,3 +8,39 @@ import os  # For file handling
 model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
 processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

 model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
 processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+# Define the function that handles the image and question input, and returns an audio response
+def answer_question_with_audio(image, question):
+    """Answer a question about an image and return the answer as spoken audio.
+
+    Args:
+        image: A PIL image, or a string file path that is opened with
+            ``Image.open`` before processing.
+        question: The question text passed to the processor with the image.
+
+    Returns:
+        Path of the MP3 file that gTTS wrote for the decoded answer.
+    """
+    # Local import keeps this function self-contained; the rest of the file
+    # does not need to import tempfile.
+    import tempfile
+
+    # Accept a file path as well as an already-loaded image
+    if isinstance(image, str):
+        image = Image.open(image)
+    # Turn the image/question pair into model-ready tensors
+    inputs = processor(image, question, return_tensors="pt")
+    # Generate the answer tokens and decode them into plain text
+    out = model.generate(**inputs)
+    answer_text = processor.decode(out[0], skip_special_tokens=True)
+    # Convert the text answer to speech using gTTS
+    tts = gTTS(text=answer_text, lang='en')
+    # Reserve a unique file per call. A single fixed "answer.mp3" would be
+    # overwritten when two requests run at the same time, so one user could
+    # receive another user's answer. delete=False keeps the file on disk after
+    # the handle is closed so the caller can still read it.
+    # NOTE(review): these files are never removed here -- confirm whether
+    # cleanup is needed for long-running deployments.
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
+        audio_path = audio_file.name
+    tts.save(audio_path)
+    # Return the path to the audio file
+    return audio_path
+# Build the web UI: the user supplies an image and a question and gets the answer back as audio
+interface = gr.Interface(
+    title="Visual Question Answering with Audio",
+    description="Upload an image and ask a question. The answer will be provided as an audio response.",
+    fn=answer_question_with_audio,  # callback invoked for each submission
+    inputs=[
+        gr.Image(type="pil"),  # image input, requested in PIL form
+        gr.Textbox(label="Question"),  # free-text question
+    ],
+    outputs=gr.Audio(label="Answer (Audio)"),  # plays the audio file returned by the callback
+)
+# Start serving the app; share=True requests public sharing
+interface.launch(share=True)