sanchit-gandhi HF staff committed on
Commit
c8a6713
1 Parent(s): efcdb1c
Files changed (1) hide show
  1. app.py +81 -13
app.py CHANGED
@@ -16,10 +16,14 @@ device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.
16
  torch_dtype = torch.float16 if device != "cpu" else torch.float32
17
 
18
  repo_id = "parler-tts/parler_tts_mini_v0.1"
 
19
 
20
  model = ParlerTTSForConditionalGeneration.from_pretrained(
21
  repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
22
  ).to(device)
 
 
 
23
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
24
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
25
 
@@ -46,6 +50,25 @@ examples = [
46
  ],
47
  ]
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  class ParlerTTSStreamer(BaseStreamer):
50
  def __init__(
51
  self,
@@ -171,7 +194,33 @@ target_dtype = np.int16
171
  max_range = np.iinfo(target_dtype).max
172
 
173
  @spaces.GPU
174
- def generate_tts(text, description, play_steps_in_s=2.0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  play_steps = int(frame_rate * play_steps_in_s)
176
  streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
177
 
@@ -196,6 +245,7 @@ def generate_tts(text, description, play_steps_in_s=2.0):
196
  new_audio = (new_audio * max_range).astype(np.int16)
197
  yield sampling_rate, new_audio
198
 
 
199
  css = """
200
  #share-btn-container {
201
  display: flex;
@@ -264,18 +314,36 @@ with gr.Blocks(css=css) as block:
264
  </p>
265
  """
266
  )
267
- with gr.Row():
268
- with gr.Column():
269
- input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
270
- description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
271
- run_button = gr.Button("Generate Audio", variant="primary")
272
- with gr.Column():
273
- audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
274
-
275
- inputs = [input_text, description]
276
- outputs = [audio_out]
277
- gr.Examples(examples=examples, fn=generate_tts, inputs=inputs, outputs=outputs, cache_examples=False)
278
- run_button.click(fn=generate_tts, inputs=inputs, outputs=outputs, queue=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  gr.HTML(
280
  """
281
  <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
 
16
  torch_dtype = torch.float16 if device != "cpu" else torch.float32
17
 
18
  repo_id = "parler-tts/parler_tts_mini_v0.1"
19
+ jenny_repo_id = "ylacombe/parler-tts-mini-jenny-30H"
20
 
21
  model = ParlerTTSForConditionalGeneration.from_pretrained(
22
  repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
23
  ).to(device)
24
+ jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
25
+ jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
26
+ ).to(device)
27
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
28
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
29
 
 
50
  ],
51
  ]
52
 
53
# Example rows for the "Jenny" fine-tuned checkpoint. Each row is
# [input text, voice description]; every description names the speaker "Jenny".
jenny_examples = [
    [
        "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
        "Jenny speaks at a fast pace in a small, confined space with a very clear audio and an animated tone.",
    ],
    [
        "'This is the best time of my life, Bartley,' she said happily.",
        "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
    ],
    # The last two rows share the same text and differ only in the description.
    [
        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
        "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
    ],
    [
        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
        "Jenny delivers words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
    ],
]
71
+
72
  class ParlerTTSStreamer(BaseStreamer):
73
  def __init__(
74
  self,
 
194
  max_range = np.iinfo(target_dtype).max
195
 
196
  @spaces.GPU
197
def generate_base(text, description, play_steps_in_s=2.0):
    """Stream synthesized speech for ``text`` from the base model.

    Generation runs on a background thread and pushes audio into a
    ``ParlerTTSStreamer``; this generator yields each chunk as soon as the
    streamer hands it over.

    Args:
        text: The prompt to be spoken (tokenized into ``prompt_input_ids``).
        description: Voice description (tokenized into ``input_ids``).
        play_steps_in_s: Chunk length in seconds; multiplied by ``frame_rate``
            to get the streamer's ``play_steps``.

    Yields:
        ``(sampling_rate, audio)`` tuples where ``audio`` is an int16 array.
    """
    streamer = ParlerTTSStreamer(
        model,
        device=device,
        play_steps=int(frame_rate * play_steps_in_s),
    )

    description_ids = tokenizer(description, return_tensors="pt").to(device).input_ids
    prompt_ids = tokenizer(text, return_tensors="pt").to(device).input_ids

    # Seed before the worker starts so sampling is reproducible per request.
    set_seed(SEED)
    worker = Thread(
        target=model.generate,
        kwargs={
            "input_ids": description_ids,
            "prompt_input_ids": prompt_ids,
            "streamer": streamer,
            "do_sample": True,
            "temperature": 1.0,
            "min_new_tokens": 10,
        },
    )
    worker.start()

    for chunk in streamer:
        duration = round(chunk.shape[0] / sampling_rate, 2)
        print(f"Sample of length: {duration} seconds")
        # Scale to the int16 range before casting.
        # NOTE(review): assumes the streamer emits floats in [-1, 1] — confirm.
        yield sampling_rate, (chunk * max_range).astype(np.int16)
221
+
222
+ @spaces.GPU
223
+ def generate_jenny(text, description, play_steps_in_s=2.0):
224
  play_steps = int(frame_rate * play_steps_in_s)
225
  streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
226
 
 
245
  new_audio = (new_audio * max_range).astype(np.int16)
246
  yield sampling_rate, new_audio
247
 
248
+
249
  css = """
250
  #share-btn-container {
251
  display: flex;
 
314
  </p>
315
  """
316
  )
317
# "Base" tab: text + description inputs, a streaming-interval slider and a
# streaming audio output wired to generate_base.
# NOTE(review): nesting is reconstructed from a flattened diff view; this
# block belongs inside the enclosing `with gr.Blocks(css=css) as block:`.
with gr.Tab("Base"):
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
            # No trailing comma after the call: a trailing comma would bind
            # a 1-tuple `(Slider,)` and put a tuple into `inputs` below.
            play_seconds = gr.Slider(
                2.5,
                5.0,
                value=2.5,
                step=0.5,
                label="Streaming interval in seconds",
                info="Lower = shorter chunks, lower latency, more codec steps",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
                elem_id="audio_out",
                streaming=True,
                autoplay=True,
            )

    inputs = [input_text, description, play_seconds]
    outputs = [audio_out]
    gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
    run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
331
# "Jenny" tab: same layout as the base tab, wired to generate_jenny.
# NOTE(review): nesting is reconstructed from a flattened diff view; this
# block belongs inside the enclosing `with gr.Blocks(css=css) as block:`.
# NOTE(review): elem_id values repeat those used in the "Base" tab; kept
# unchanged because the CSS may target them — confirm duplicates are intended.
with gr.Tab("Jenny"):
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
            # No trailing comma after the call: a trailing comma would bind
            # a 1-tuple `(Slider,)` and put a tuple into `inputs` below.
            play_seconds = gr.Slider(
                2.5,
                5.0,
                value=2.5,
                step=0.5,
                label="Streaming interval in seconds",
                info="Lower = shorter chunks, lower latency, more codec steps",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
                elem_id="audio_out",
                streaming=True,
                autoplay=True,
            )

    inputs = [input_text, description, play_seconds]
    outputs = [audio_out]
    # Show the Jenny-specific examples (descriptions that name "Jenny")
    # rather than the base model's example list.
    gr.Examples(examples=jenny_examples, fn=generate_jenny, inputs=inputs, outputs=outputs, cache_examples=False)
    run_button.click(fn=generate_jenny, inputs=inputs, outputs=outputs, queue=True)
346
+
347
  gr.HTML(
348
  """
349
  <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.