Spaces:

parler-tts
/

parler_tts

Running on Zero

App Files Community

ylacombe HF staff committed on Sep 30

Commit

b8391d0

•

1 Parent(s): ae05283

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -18

app.py CHANGED Viewed

@@ -13,13 +13,10 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 repo_id =  "parler-tts/parler-tts-mini-v1"
-repo_id_large = "parler-tts/parler-tts-large-v1"
-repo_id_tiny =  "parler-tts/parler-tts-tiny-v1"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 model_large = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_large).to(device)
-model_tiny = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_tiny).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
@@ -33,27 +30,27 @@ examples = [
     [
         "This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
         "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
-        "Mini",
     ],
     [
         '''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
         "Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
-        "Mini"
     ],
     [
         "'This is the best time of my life, Bartley,' she said happily.",
         "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
-        "Mini",
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
-        "Mini"
     ],
     [
         "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
         "In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
-        "Mini",
     ],
 ]
@@ -79,21 +76,17 @@ def preprocess(text):
     return text
 @spaces.GPU
-def gen_tts(text, description, version_to_use=False):
     inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
     prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
     set_seed(SEED)
-    if version_to_use=="Large":
         generation = model_large.generate(
             input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
         )
-    elif version_to_use=="Miny":
-        generation = model.generate(
-            input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
-        )
     else:
-        generation = model_tiny.generate(
             input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
         )
     audio_arr = generation.cpu().numpy().squeeze()
@@ -170,12 +163,12 @@ with gr.Blocks(css=css) as block:
         with gr.Column():
             input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
             description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
-            version_to_use = gr.Radio(["Tiny", "Mini", "Large"], value="Mini", label="Checkpoint to use", info="The larger the model, the better it is, at the cost of speed.")
             run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
             audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
-    inputs = [input_text, description, version_to_use]
     outputs = [audio_out]
     run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
     gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)

 repo_id =  "parler-tts/parler-tts-mini-v1"
+repo_id_large = "ylacombe/parler-large-v1-og"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 model_large = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_large).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
     [
         "This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
         "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
+        None,
     ],
     [
         '''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
         "Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
+        None
     ],
     [
         "'This is the best time of my life, Bartley,' she said happily.",
         "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
+        None,
     ],
     [
         "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
         "A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
+        None
     ],
     [
         "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
         "In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
+        None,
     ],
 ]
     return text
 @spaces.GPU
+def gen_tts(text, description, use_large=False):
     inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
     prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
     set_seed(SEED)
+    if use_large:
         generation = model_large.generate(
             input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
         )
     else:
+        generation = model.generate(
             input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
         )
     audio_arr = generation.cpu().numpy().squeeze()
         with gr.Column():
             input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
             description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
+            use_large = gr.Checkbox(value=False, label="Use Large checkpoint", info="Generate with Parler-TTS Large v1 instead of Mini v1 - Better but way slower.")
             run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
             audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
+    inputs = [input_text, description, use_large]
     outputs = [audio_out]
     run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
     gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)