ronniet committed on
Commit
04b62bf
1 Parent(s): 946a993

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -6
app.py CHANGED
@@ -1,23 +1,72 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
- from TTS.api import TTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  captioner = pipeline(model="microsoft/git-base")
6
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
7
 
8
 
9
def predict(image):
    """Caption an image and write the spoken caption to a WAV file.

    Args:
        image: The input picture, passed straight to the module-level
            ``captioner`` pipeline.

    Returns:
        A ``(caption, wav_path)`` tuple: the generated caption text and the
        path of the audio file the caption was synthesized into.
    """
    caption = captioner(image)[0]["generated_text"]

    # The same file name is reused on every call, so each request
    # overwrites the previous recording.
    wav_path = "output.wav"
    tts.tts_to_file(
        caption,
        speaker=tts.speakers[0],
        language="en",
        file_path=wav_path,
    )

    return caption, wav_path
16
 
17
  demo = gr.Interface(
18
  fn=predict,
19
  inputs=gr.Image(type="pil"),
20
- outputs=['text', gr.Audio()]
21
  )
22
 
23
  demo.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
3
+ # from TTS.api import TTS
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import torch
8
+
9
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
10
+
11
+
12
+ checkpoint = "microsoft/speecht5_tts"
13
+ processor = SpeechT5Processor.from_pretrained(checkpoint)
14
+ model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
15
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
16
+
17
def tts(text):
    """Synthesize ``text`` into speech with the module-level SpeechT5 model.

    Args:
        text: The sentence to speak. ``None``, an empty string, or a
            whitespace-only string produces an empty (silent) clip instead
            of invoking the model.

    Returns:
        A ``(sample_rate, samples)`` tuple. ``sample_rate`` is always 16000
        and ``samples`` is a NumPy ``int16`` array holding the waveform.
    """
    # Nothing to say: return an empty clip without touching the model.
    if not text or not text.strip():
        return (16000, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit the input length to what the model's configuration allows.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # A single fixed voice is used; the .npy file is presumably a precomputed
    # speaker embedding shipped next to the app -- TODO confirm its origin.
    speaker_embedding = np.load("cmu_us_bdl_arctic-wav-arctic_a0009.npy")
    # Add a leading batch dimension before handing it to the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Saturate to [-1, 1] before scaling: without the clip, any sample that
    # overshoots the nominal range overflows the int16 conversion and turns
    # into a loud click instead of a slightly flattened peak.
    samples = np.clip(speech.numpy(), -1.0, 1.0) * 32767
    return (16000, samples.astype(np.int16))
51
+
52
 
53
  captioner = pipeline(model="microsoft/git-base")
54
+ # tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
55
 
56
 
57
def predict(image):
    """Caption an image and synthesize the caption as speech.

    Args:
        image: The input picture, passed straight to the module-level
            ``captioner`` pipeline.

    Returns:
        A ``(caption, audio)`` tuple: the generated caption text and
        whatever ``tts`` returns for that caption.
    """
    caption = captioner(image)[0]["generated_text"]
    return caption, tts(caption)
65
 
66
  demo = gr.Interface(
67
  fn=predict,
68
  inputs=gr.Image(type="pil"),
69
+ outputs=['text', gr.Audio(type="numpy")]
70
  )
71
 
72
  demo.launch()