kennethli319 committed
Commit e6e581b · Parent(s): 7b0aa8e

update app

Files changed (1): app.py (+13 −109)
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 import torchaudio
 import tempfile
-import logging
+
 import numpy as np
 from nemo.collections.tts.models import FastPitchModel
 from nemo.collections.tts.models import HifiGanModel
@@ -13,119 +13,26 @@ from transformers import pipeline
 # spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
 # model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")
 
-spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
-spec_generator.eval()
-voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
-voc_model.eval()
-
-pipe = pipeline("text-to-speech", model="suno/bark-small")
 
 def greet(name):
     return "Hello " + name + "!!"
 
-def generate_tts(text: str, speaker: int = 0):
-    sr = 44100
-    parsed = spec_generator.parse(text)
-    spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
-    audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
-
-    return (sr, audio.squeeze(0).cpu().numpy())
-
 def run():
-    logging.basicConfig(level=logging.INFO)
-
-    with gr.Blocks() as demo:
-        gr.Markdown(
-            """
-            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>
-            1. Write an utterance to generate,
-            2. Select the model to synthesize with
-            3. Select speaker
-            4. Hit "Generate" and listen to the result!
-            You can learn more about models available
-            [here](https://huggingface.co/balacoon/tts).
-            Visit [Balacoon website](https://balacoon.com/) for more info.
-            """
-        )
-        with gr.Row(variant="panel"):
-            text = gr.Textbox(label="Text", placeholder="Type something here...")
-
-        with gr.Row():
-            with gr.Column(variant="panel"):
-                repo_files = os.listdir(model_repo_dir)
-                model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
-                model_name = gr.Dropdown(
-                    label="Model",
-                    choices=model_files,
-                )
-            with gr.Column(variant="panel"):
-                speaker = gr.Dropdown(label="Speaker", choices=[])
-
-        def set_model(model_name_str: str):
-            """
-            gets value from `model_name`. either
-            uses cached list of speakers for the given model name
-            or loads the addon and checks what are the speakers.
-            """
-            global model_to_speakers
-            if model_name_str in model_to_speakers:
-                speakers = model_to_speakers[model_name_str]
-            else:
-                global tts, cur_model_path, locker
-                with locker:
-                    # need to load this model to learn the list of speakers
-                    model_path = os.path.join(model_repo_dir, model_name_str)
-                    if tts is not None:
-                        del tts
-                    tts = TTS(model_path)
-                    cur_model_path = model_path
-                    speakers = tts.get_speakers()
-                    model_to_speakers[model_name_str] = speakers
-
-            value = speakers[-1]
-            return gr.Dropdown.update(
-                choices=speakers, value=value, visible=True
-            )
-
-        model_name.change(set_model, inputs=model_name, outputs=speaker)
-
-        with gr.Row(variant="panel"):
-            generate = gr.Button("Generate")
-        with gr.Row(variant="panel"):
-            audio = gr.Audio()
-
-        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
-            """
-            gets utterance to synthesize from `text` Textbox
-            and speaker name from `speaker` dropdown list.
-            speaker name might be empty for single-speaker models.
-            Synthesizes the waveform and updates `audio` with it.
-            """
-            if not text_str or not model_name_str or not speaker_str:
-                logging.info("text, model name or speaker are not provided")
-                return None
-            expected_model_path = os.path.join(model_repo_dir, model_name_str)
-            global tts, cur_model_path, locker
-            with locker:
-                if expected_model_path != cur_model_path:
-                    # reload model
-                    if tts is not None:
-                        del tts
-                    tts = TTS(expected_model_path)
-                    cur_model_path = expected_model_path
-            if len(text_str) > 1024:
-                # truncate the text
-                text_str = text_str[:1024]
-            samples = tts.synthesize(text_str, speaker_str)
-            return gr.Audio.update(value=(tts.get_sampling_rate(), samples))
-
-        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)
-
-    demo.queue(concurrency_count=1).launch()
-
-
-
-
+
+    spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
+    spec_generator.eval()
+    voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
+    voc_model.eval()
+
+    pipe = pipeline("text-to-speech", model="suno/bark-small")
+
+    def generate_tts(text: str, speaker: int = 0):
+        sr = 44100
+        parsed = spec_generator.parse(text)
+        spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
+        audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
+
+        return (sr, audio.squeeze(0).cpu().numpy())
 
     demo = gr.Interface(
         fn=generate_tts,
@@ -134,9 +41,6 @@ def run():
         outputs=gr.Audio(label="Output", type="numpy"),
     )
 
-    with gr.Row(variant="panel"):
-        generate = gr.Button("Generate")
-
     demo.launch(server_name="0.0.0.0", server_port=7860)
 
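After this change, synthesis goes through a two-stage NeMo pipeline: FastPitch turns text into a mel spectrogram, and a HiFi-GAN vocoder turns the spectrogram into a waveform. Below is a minimal standalone sketch of that path, using the same checkpoints and calls as the diff; running it outside the app and saving the result with torchaudio are illustrative assumptions, not part of the commit.

import torch
import torchaudio
from nemo.collections.tts.models import FastPitchModel, HifiGanModel

# Same checkpoints the commit loads inside run().
spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
spec_generator.eval()
voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
voc_model.eval()

with torch.no_grad():
    # Stage 1: text -> tokens -> mel spectrogram; `speaker` selects a voice
    # from the multi-speaker FastPitch model.
    tokens = spec_generator.parse("Hello world")
    spectrogram = spec_generator.generate_spectrogram(tokens=tokens, speaker=0)
    # Stage 2: mel spectrogram -> waveform via the HiFi-GAN vocoder.
    audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)

# The HiFi-TTS checkpoints run at 44.1 kHz, matching the sr hard-coded in generate_tts.
torchaudio.save("fastpitch_sample.wav", audio.cpu(), 44100)

The Gradio Interface then simply exposes generate_tts, whose (44100, numpy_array) return value is the tuple that gr.Audio(type="numpy") expects.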
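Note that the commit also loads pipe = pipeline("text-to-speech", model="suno/bark-small") inside run() but never calls it. For reference, the transformers text-to-speech pipeline returns a dict with "audio" and "sampling_rate" keys, so wiring it into the same Gradio output format could look like the sketch below; the generate_bark helper is hypothetical, not part of the commit.

from transformers import pipeline

pipe = pipeline("text-to-speech", model="suno/bark-small")

def generate_bark(text: str):
    # Hypothetical helper: the pipeline returns
    # {"audio": np.ndarray, "sampling_rate": int}, which maps directly onto
    # the (sample_rate, samples) tuple gr.Audio(type="numpy") expects.
    out = pipe(text)
    return (out["sampling_rate"], out["audio"].squeeze())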