ylacombe HF staff committed on
Commit
b8391d0
1 Parent(s): ae05283

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -18
app.py CHANGED
@@ -13,13 +13,10 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
 
15
  repo_id = "parler-tts/parler-tts-mini-v1"
16
- repo_id_large = "parler-tts/parler-tts-large-v1"
17
- repo_id_tiny = "parler-tts/parler-tts-tiny-v1"
18
 
19
  model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
20
  model_large = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_large).to(device)
21
- model_tiny = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_tiny).to(device)
22
-
23
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
24
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
25
 
@@ -33,27 +30,27 @@ examples = [
33
  [
34
  "This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
35
  "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
36
- "Mini",
37
  ],
38
  [
39
  '''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
40
  "Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
41
- "Mini"
42
  ],
43
  [
44
  "'This is the best time of my life, Bartley,' she said happily.",
45
  "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
46
- "Mini",
47
  ],
48
  [
49
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
50
  "A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
51
- "Mini"
52
  ],
53
  [
54
  "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
55
  "In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
56
- "Mini",
57
  ],
58
  ]
59
 
@@ -79,21 +76,17 @@ def preprocess(text):
79
  return text
80
 
81
  @spaces.GPU
82
- def gen_tts(text, description, version_to_use=False):
83
  inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
84
  prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
85
 
86
  set_seed(SEED)
87
- if version_to_use=="Large":
88
  generation = model_large.generate(
89
  input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
90
  )
91
- elif version_to_use=="Miny":
92
- generation = model.generate(
93
- input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
94
- )
95
  else:
96
- generation = model_tiny.generate(
97
  input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
98
  )
99
  audio_arr = generation.cpu().numpy().squeeze()
@@ -170,12 +163,12 @@ with gr.Blocks(css=css) as block:
170
  with gr.Column():
171
  input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
172
  description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
173
- version_to_use = gr.Radio(["Tiny", "Mini", "Large"], value="Mini", label="Checkpoint to use", info="The larger the model, the better it is, at the cost of speed.")
174
  run_button = gr.Button("Generate Audio", variant="primary")
175
  with gr.Column():
176
  audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
177
 
178
- inputs = [input_text, description, version_to_use]
179
  outputs = [audio_out]
180
  run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
181
  gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
 
13
 
14
 
15
  repo_id = "parler-tts/parler-tts-mini-v1"
16
+ repo_id_large = "ylacombe/parler-large-v1-og"
 
17
 
18
  model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
19
  model_large = ParlerTTSForConditionalGeneration.from_pretrained(repo_id_large).to(device)
 
 
20
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
21
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
22
 
 
30
  [
31
  "This version introduces speaker consistency across generations, characterized by their name. For example, Jon, Lea, Gary, Jenna, Mike and Laura.",
32
  "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
33
+ None,
34
  ],
35
  [
36
  '''There's 34 speakers. To take advantage of this, simply adapt your text description to specify which speaker to use: "Mike speaks animatedly...".''',
37
  "Gary speaks slightly animatedly and slightly slowly in delivery, with a very close recording that has no background noise.",
38
+ None
39
  ],
40
  [
41
  "'This is the best time of my life, Bartley,' she said happily.",
42
  "A female speaker delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice and slight background noise, creating a close-sounding audio experience.",
43
+ None,
44
  ],
45
  [
46
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
47
  "A man voice speaks slightly slowly with very noisy background, carrying a low-pitch tone and displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
48
+ None
49
  ],
50
  [
51
  "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her pal-ace window, which had a carved frame of black wood.",
52
  "In a very poor recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
53
+ None,
54
  ],
55
  ]
56
 
 
76
  return text
77
 
78
  @spaces.GPU
79
+ def gen_tts(text, description, use_large=False):
80
  inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
81
  prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
82
 
83
  set_seed(SEED)
84
+ if use_large:
85
  generation = model_large.generate(
86
  input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
87
  )
 
 
 
 
88
  else:
89
+ generation = model.generate(
90
  input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask, prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
91
  )
92
  audio_arr = generation.cpu().numpy().squeeze()
 
163
  with gr.Column():
164
  input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
165
  description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
166
+ use_large = gr.Checkbox(value=False, label="Use Large checkpoint", info="Generate with Parler-TTS Large v1 instead of Mini v1 - Better but way slower.")
167
  run_button = gr.Button("Generate Audio", variant="primary")
168
  with gr.Column():
169
  audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
170
 
171
+ inputs = [input_text, description, use_large]
172
  outputs = [audio_out]
173
  run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
174
  gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)