Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from PIL import Image
|
3 |
+
from transformers import pipeline
|
4 |
+
import scipy.io.wavfile as wavfile
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
# Image-captioning pipeline (BLIP large): takes a PIL image, returns caption text.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Text-to-speech pipeline (VITS, LJSpeech voice): takes text, returns audio data.
# NOTE(review): both models are loaded at import time and will be downloaded on
# first run — startup can be slow; confirm this is acceptable for deployment.
Narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
|
10 |
+
|
11 |
+
|
12 |
+
def generate_audio(text, output_path="generated_audio.wav"):
    """Synthesize speech for *text* with the VITS Narrator and save it as WAV.

    Args:
        text: The text to narrate.
        output_path: Destination WAV file path. Defaults to the original
            fixed filename, so existing callers are unaffected.

    Returns:
        The path of the saved WAV file (consumable by ``gr.Audio``).
    """
    # Generate speech from the input text using the Narrator (VITS model).
    narration = Narrator(text)

    # Extract the audio data and sampling rate.
    # Assumes the pipeline output dict holds "audio" as a 2-D array whose
    # first row is the waveform — matches the original indexing; TODO confirm.
    audio_data = np.array(narration["audio"][0])
    sampling_rate = narration["sampling_rate"]

    # Persist the waveform as a WAV file so Gradio can serve it by path.
    wavfile.write(output_path, rate=sampling_rate, data=audio_data)

    return output_path
|
25 |
+
|
26 |
+
|
27 |
+
def caption_my_image(pil_image):
    """Caption *pil_image* with BLIP and narrate the caption as speech.

    Returns the filename of the WAV file produced by ``generate_audio``.
    """
    # BLIP returns a list of candidate captions; use the first one's text.
    captions = caption_image(images=pil_image)
    description = captions[0]["generated_text"]

    # Hand the caption to the TTS stage and return the audio file path.
    return generate_audio(description)
|
33 |
+
|
34 |
+
|
35 |
+
# Main functionality tab: wires caption_my_image to an image input and an
# audio output. gr.Image(type="pil") delivers a PIL.Image to the function;
# gr.Audio accepts the filepath string the function returns.
main_tab = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[gr.Audio(label="Generated Audio")],
    title="Image Audio Captioning App",
    description="Upload an image, and the app will generate a textual description and convert it into speech."
)
|
43 |
+
|
44 |
+
# Information tab: static Markdown shown alongside the main tab.
# The string below is rendered verbatim in the UI — do not edit casually.
info_tab = gr.Markdown("""
# Image Audio Captioning App

### Purpose
This application is designed to assist visually impaired users by providing audio descriptions of images. It can also be used in various scenarios such as creating audio captions for educational materials, enhancing accessibility for digital content, and more.

### How to Use
- **Step 1:** Click on the 'Select Image' button to upload an image.
- **Step 2:** Wait for the application to generate the audio description.
- **Step 3:** Listen to the generated audio file.

### Limits
- The quality of the description depends on the image clarity and content.
- The application might not work well with images that have complex scenes or unclear subjects.
- Audio generation time may vary depending on the input image size and content.

### Note
- Ensure the uploaded image is clear and well-defined for the best results.
- This app is a prototype and may have limitations in real-world applications.
""")
|
65 |
+
|
66 |
+
# Combine both tabs into a single app; "Main" is shown first.
demo = gr.TabbedInterface(
    [main_tab, info_tab],
    tab_names=["Main", "Information"]
)

# Start the Gradio server (blocking call at import/run time).
demo.launch()
|