saja003 committed on
Commit
314df46
·
verified ·
1 Parent(s): 38a16e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -0
app.py CHANGED
@@ -8,3 +8,39 @@ import os # For file handling
8
  model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
9
  processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
9
  processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
10
 
11
+ # Define the function that handles the image and question input, and returns an audio response
12
def answer_question_with_audio(image, question):
    """Answer a question about an image and return the answer as speech.

    Args:
        image: A PIL image, or a string path to an image file.
        question: The question to ask about the image.

    Returns:
        Path (str) to an MP3 file containing the spoken answer. Each call
        writes to its own temporary file.

    Raises:
        ValueError: If no image was supplied.
    """
    # Fail early with a clear message instead of an opaque processor error.
    if image is None:
        raise ValueError("An image is required to answer the question.")

    import tempfile  # local import: only this function needs it

    # If the input is a file path, load the image and release the file handle.
    if isinstance(image, str):
        with Image.open(image) as opened:
            # copy() forces the pixel data to load so the image stays usable
            # after the underlying file is closed.
            image = opened.copy()

    # Turn the image/question pair into model-ready tensors.
    inputs = processor(image, question, return_tensors="pt")

    # Generate the model's response to the question.
    out = model.generate(**inputs)

    # Decode the generated token ids into a human-readable answer.
    answer_text = processor.decode(out[0], skip_special_tokens=True).strip()

    # gTTS refuses empty text, so speak a fallback sentence in that case.
    if not answer_text:
        answer_text = "Sorry, I could not find an answer."

    # Convert the text answer to audio using gTTS.
    tts = gTTS(text=answer_text, lang='en')

    # Use a unique file per request: a shared "answer.mp3" would be
    # overwritten when several users query the app at the same time.
    # NOTE(review): the temp file is not deleted here; presumably the caller
    # (Gradio) copies it into its own cache - confirm and add cleanup if not.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_path = tmp.name
    tts.save(audio_path)

    # Return the path to the audio file.
    return audio_path
35
+
36
+ # Create a Gradio interface with image and text inputs, and an audio output
37
# Wire the VQA function into a Gradio UI: an image plus a question go in,
# a spoken answer comes out.
interface = gr.Interface(
    answer_question_with_audio,  # callback run on every submission
    [gr.Image(type="pil"), gr.Textbox(label="Question")],  # inputs, in argument order
    gr.Audio(label="Answer (Audio)"),  # output: audio player for the answer
    title="Visual Question Answering with Audio",
    description="Upload an image and ask a question. The answer will be provided as an audio response.",
)

# Start the app and request a public share link.
interface.launch(share=True)