Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -13,13 +13,10 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
13 |
|
14 |
|
15 |
repo_id = "parler-tts/parler-tts-mini-v1"
|
16 |
-
repo_id_large = "
|
17 |
-
repo_id_tiny = "parler-tts/parler-tts-tiny-v1"
|
18 |
|
19 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
20 |
model_large = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_large).to(device)
|
21 |
-
model_tiny = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_tiny).to(device)
|
22 |
-
|
23 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
24 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
25 |
|
@@ -33,27 +30,27 @@ examples = [
|
|
33 |
[
|
34 |
"This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
|
35 |
"Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
|
36 |
-
|
37 |
],
|
38 |
[
|
39 |
'''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
|
40 |
"Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
|
41 |
-
|
42 |
],
|
43 |
[
|
44 |
"'This is the best time of my life, Bartley,' she said happily.",
|
45 |
"A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
|
46 |
-
|
47 |
],
|
48 |
[
|
49 |
"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
|
50 |
"A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
|
51 |
-
|
52 |
],
|
53 |
[
|
54 |
"Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
|
55 |
"In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
|
56 |
-
|
57 |
],
|
58 |
]
|
59 |
|
@@ -79,21 +76,17 @@ def preprocess(text):
|
|
79 |
return text
|
80 |
|
81 |
@spaces.GPU
|
82 |
-
def gen_tts(text, description,
|
83 |
inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
|
84 |
prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
|
85 |
|
86 |
set_seed(SEED)
|
87 |
-
if
|
88 |
generation = model_large.generate(
|
89 |
input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
|
90 |
)
|
91 |
-
elif version_to_use=="Miny":
|
92 |
-
generation = model.generate(
|
93 |
-
input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
|
94 |
-
)
|
95 |
else:
|
96 |
-
generation =
|
97 |
input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
|
98 |
)
|
99 |
audio_arr = generation.cpu().numpy().squeeze()
|
@@ -170,12 +163,12 @@ with gr.Blocks(css=css) as block:
|
|
170 |
with gr.Column():
|
171 |
input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
|
172 |
description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
|
173 |
-
|
174 |
run_button = gr.Button("Generate Audio", variant="primary")
|
175 |
with gr.Column():
|
176 |
audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
|
177 |
|
178 |
-
inputs = [input_text, description,
|
179 |
outputs = [audio_out]
|
180 |
run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
|
181 |
gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
|
|
|
13 |
|
14 |
|
15 |
repo_id = "parler-tts/parler-tts-mini-v1"
|
16 |
+
repo_id_large = "ylacombe/parler-large-v1-og"
|
|
|
17 |
|
18 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
19 |
model_large = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_large).to(device)
|
|
|
|
|
20 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
21 |
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
22 |
|
|
|
30 |
[
|
31 |
"This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
|
32 |
"Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
|
33 |
+
None,
|
34 |
],
|
35 |
[
|
36 |
'''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
|
37 |
"Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
|
38 |
+
None
|
39 |
],
|
40 |
[
|
41 |
"'This is the best time of my life, Bartley,' she said happily.",
|
42 |
"A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
|
43 |
+
None,
|
44 |
],
|
45 |
[
|
46 |
"Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
|
47 |
"A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
|
48 |
+
None
|
49 |
],
|
50 |
[
|
51 |
"Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
|
52 |
"In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
|
53 |
+
None,
|
54 |
],
|
55 |
]
|
56 |
|
|
|
76 |
return text
|
77 |
|
78 |
@spaces.GPU
|
79 |
+
def gen_tts(text, description, use_large=False):
|
80 |
inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
|
81 |
prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
|
82 |
|
83 |
set_seed(SEED)
|
84 |
+
if use_large:
|
85 |
generation = model_large.generate(
|
86 |
input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
|
87 |
)
|
|
|
|
|
|
|
|
|
88 |
else:
|
89 |
+
generation = model.generate(
|
90 |
input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
|
91 |
)
|
92 |
audio_arr = generation.cpu().numpy().squeeze()
|
|
|
163 |
with gr.Column():
|
164 |
input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
|
165 |
description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
|
166 |
+
use_large = gr.Checkbox(value=False, label="Use Large checkpoint", info="Generate with Parler-TTS Large v1 instead of Mini v1 - Better but way slower.")
|
167 |
run_button = gr.Button("Generate Audio", variant="primary")
|
168 |
with gr.Column():
|
169 |
audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
|
170 |
|
171 |
+
inputs = [input_text, description, use_large]
|
172 |
outputs = [audio_out]
|
173 |
run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
|
174 |
gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
|