Update app.py
app.py CHANGED
@@ -1,18 +1,19 @@
-
+from TTS.api import TTS
 from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-
+import torchaudio
+from torchaudio.transforms import Resample
+import torch
+import gradio as gr
+
+# Initialize TTS model from TTS library
+tts_model_path = "tts_models/multilingual/multi-dataset/xtts_v2"
+tts = TTS(tts_model_path, gpu=True)
 
 # Initialize Blip model for image captioning
 model_id = "dblasko/blip-dalle3-img2prompt"
 blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
 blip_processor = BlipProcessor.from_pretrained(model_id)
 
-# Initialize TTS model from Hugging Face
-model_name = "tts-mozilla/tts-ljspeech-multilingual"
-tts_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-tts_tokenizer = AutoTokenizer.from_pretrained(model_name)
-tts = pipeline(task="text2speech", model=tts_model, tokenizer=tts_tokenizer)
-
 def generate_caption(image):
     # Generate caption from image using Blip model
     inputs = blip_processor(images=image, return_tensors="pt")
@@ -21,20 +22,30 @@ def generate_caption(image):
     generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
 
     # Use TTS model to convert generated caption to audio
-
-
+    tts.tts_to_file(text=generated_caption,
+                    file_path="generated_audio.wav",
+                    speaker_wav="/path/to/target/speaker.wav",
+                    language="en")
+
+    # Resample the audio to match the expected sampling rate
+    waveform, sample_rate = torchaudio.load("generated_audio.wav")
+    resampler = Resample(orig_freq=sample_rate, new_freq=24_000)
+    waveform_resampled = resampler(waveform)
+
+    # Save the resampled audio
+    torchaudio.save("generated_audio_resampled.wav", waveform_resampled, 24_000)
 
-    return generated_caption, "
+    return generated_caption, "generated_audio_resampled.wav"
 
 # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
 demo = gr.Interface(
-    fn=generate_caption,
-    inputs=gr.Image(),
+    fn=generate_caption,
+    inputs=gr.Image(),
     outputs=[
-        gr.Textbox(label="Generated caption"),
-        gr.Button("Convert to Audio"),
+        gr.Textbox(label="Generated caption"),
+        gr.Button("Convert to Audio"),
         gr.Audio(type="player", label="Generated Audio")
     ],
-    live=True
+    live=True
 )
 demo.launch(share=True)
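
For reference, a minimal standalone sketch of the new audio path introduced by this commit. It assumes the Coqui TTS package and torchaudio are installed and a GPU is available; the caption text, output file names, and the reference speaker clip are placeholders, since XTTS v2 needs a real speaker sample for voice cloning.

from TTS.api import TTS
import torchaudio
from torchaudio.transforms import Resample

# Load XTTS v2 once at startup, as app.py does (gpu=True assumes CUDA is available).
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

def caption_to_audio(caption, speaker_wav="speaker.wav"):
    # Synthesize speech from the caption; XTTS v2 clones the voice in speaker_wav.
    tts.tts_to_file(text=caption,
                    file_path="generated_audio.wav",
                    speaker_wav=speaker_wav,  # placeholder reference voice clip
                    language="en")
    # Resample the synthesized audio to 24 kHz and save it, mirroring the commit.
    waveform, sample_rate = torchaudio.load("generated_audio.wav")
    waveform = Resample(orig_freq=sample_rate, new_freq=24_000)(waveform)
    torchaudio.save("generated_audio_resampled.wav", waveform, 24_000)
    return "generated_audio_resampled.wav"

if __name__ == "__main__":
    print(caption_to_audio("A watercolor painting of a lighthouse at dusk."))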