Nepjune committed on
Commit cf2cafe · verified · 1 Parent(s): 93e78fe

Update app.py

Files changed (1)
app.py +27 -16
app.py CHANGED
@@ -1,18 +1,19 @@
-import gradio as gr
+from TTS.api import TTS
 from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from transformers import AutoTokenizer
+import torchaudio
+from torchaudio.transforms import Resample
+import torch
+import gradio as gr
+
+# Initialize TTS model from TTS library
+tts_model_path = "tts_models/multilingual/multi-dataset/xtts_v2"
+tts = TTS(tts_model_path, gpu=True)

 # Initialize Blip model for image captioning
 model_id = "dblasko/blip-dalle3-img2prompt"
 blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
 blip_processor = BlipProcessor.from_pretrained(model_id)

-# Initialize TTS model from Hugging Face
-model_name = "tts-mozilla/tts-ljspeech-multilingual"
-tts_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-tts_tokenizer = AutoTokenizer.from_pretrained(model_name)
-tts = pipeline(task="text2speech", model=tts_model, tokenizer=tts_tokenizer)
-
 def generate_caption(image):
     # Generate caption from image using Blip model
     inputs = blip_processor(images=image, return_tensors="pt")
@@ -21,20 +22,30 @@ def generate_caption(image):
     generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]

     # Use TTS model to convert generated caption to audio
-    audio_output = tts(generated_caption)
-    audio_output.save_to_path("generated_audio.mp3")
+    tts.tts_to_file(text=generated_caption,
+                    file_path="generated_audio.wav",
+                    speaker_wav="/path/to/target/speaker.wav",
+                    language="en")
+
+    # Resample the audio to match the expected sampling rate
+    waveform, sample_rate = torchaudio.load("generated_audio.wav")
+    resampler = Resample(orig_freq=sample_rate, new_freq=24_000)
+    waveform_resampled = resampler(waveform)
+
+    # Save the resampled audio
+    torchaudio.save("generated_audio_resampled.wav", waveform_resampled, 24_000)

-    return generated_caption, "generated_audio.mp3"
+    return generated_caption, "generated_audio_resampled.wav"

 # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
 demo = gr.Interface(
     fn=generate_caption,
     inputs=gr.Image(),
     outputs=[
         gr.Textbox(label="Generated caption"),
         gr.Button("Convert to Audio"),
         gr.Audio(type="player", label="Generated Audio")
     ],
-    live=True  # make Gradio run without blocking
+    live=True
 )
 demo.launch(share=True)
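
For reference, below is a minimal, self-contained sketch of the pipeline this commit moves to: BLIP captioning, Coqui XTTS v2 speech synthesis, a 24 kHz resample, and a Gradio app. The blip_model.generate(...) call, the max_new_tokens value, the speaker.wav reference clip, and the Gradio component types are illustrative assumptions that do not appear in the diff above; the committed file may differ.

# Sketch only: caption an image with BLIP, synthesize speech with Coqui XTTS v2,
# resample to 24 kHz, and serve via Gradio. speaker.wav and max_new_tokens are
# illustrative assumptions, not values taken from this commit.
import gradio as gr
import torchaudio
from torchaudio.transforms import Resample
from TTS.api import TTS
from transformers import BlipForConditionalGeneration, BlipProcessor

# Image-captioning model (same checkpoint as the commit)
model_id = "dblasko/blip-dalle3-img2prompt"
blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
blip_processor = BlipProcessor.from_pretrained(model_id)

# Coqui XTTS v2; set gpu=True when CUDA is available
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

def generate_caption(image):
    # 1) Caption the image with BLIP
    inputs = blip_processor(images=image, return_tensors="pt")
    generated_ids = blip_model.generate(**inputs, max_new_tokens=50)  # assumed generation step
    caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # 2) Synthesize speech; XTTS v2 needs a reference speaker clip and a language code
    tts.tts_to_file(
        text=caption,
        file_path="generated_audio.wav",
        speaker_wav="speaker.wav",  # hypothetical reference recording
        language="en",
    )

    # 3) Resample to 24 kHz, as the commit does
    waveform, sample_rate = torchaudio.load("generated_audio.wav")
    waveform = Resample(orig_freq=sample_rate, new_freq=24_000)(waveform)
    torchaudio.save("generated_audio_resampled.wav", waveform, 24_000)

    return caption, "generated_audio_resampled.wav"

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated caption"),
        gr.Audio(type="filepath", label="Generated Audio"),
    ],
)

if __name__ == "__main__":
    demo.launch()

The sketch returns two values and declares two output components; the commit's extra gr.Button output and live=True flag are left out to keep the example minimal. XTTS v2 normally emits 24 kHz audio already, so the resampling step mostly acts as a safeguard that the saved file matches the rate the interface expects.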