freddyaboulton HF staff committed on
Commit
ee4aecd
1 Parent(s): 72c65b6
Files changed (1) hide show
  1. app.py +22 -23
app.py CHANGED
@@ -29,10 +29,6 @@ model = ParlerTTSForConditionalGeneration.from_pretrained(
29
  jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
30
  ).to(device)
31
 
32
- model = ParlerTTSForConditionalGeneration.from_pretrained(
33
- repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
34
- ).to(device)
35
-
36
  client = InferenceClient()
37
 
38
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -213,6 +209,7 @@ def generate_base(subject, setting, ):
213
  response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
214
  gr.Info("Story Generated", duration=3)
215
  story = response.choices[0].message.content
 
216
 
217
  model_input = story.replace("\n", " ").strip()
218
  model_input = nltk.sent_tokenize(model_input)
@@ -221,29 +218,31 @@ def generate_base(subject, setting, ):
221
  play_steps = int(frame_rate * play_steps_in_s)
222
  streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
223
 
224
- description = "A female speaker with a calm, warm, monotone voice delivers her words at a normal pace confined space with very clear audio."
225
  inputs = tokenizer(description, return_tensors="pt").to(device)
226
- prompt = tokenizer(story, return_tensors="pt").to(device)
227
-
228
- generation_kwargs = dict(
229
- input_ids=inputs.input_ids,
230
- prompt_input_ids=prompt.input_ids,
231
- streamer=streamer,
232
- do_sample=True,
233
- temperature=1.0,
234
- min_new_tokens=10,
235
- )
236
 
237
- set_seed(SEED)
238
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
239
- thread.start()
240
 
241
- yield story, None
242
 
243
- gr.Info("Reading story", duration=3)
244
- for new_audio in streamer:
245
- print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
246
- yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  with gr.Blocks() as block:
 
29
  jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
30
  ).to(device)
31
 
 
 
 
 
32
  client = InferenceClient()
33
 
34
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
 
209
  response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
210
  gr.Info("Story Generated", duration=3)
211
  story = response.choices[0].message.content
212
+ yield story, None
213
 
214
  model_input = story.replace("\n", " ").strip()
215
  model_input = nltk.sent_tokenize(model_input)
 
218
  play_steps = int(frame_rate * play_steps_in_s)
219
  streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
220
 
221
+ description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
222
  inputs = tokenizer(description, return_tensors="pt").to(device)
 
 
 
 
 
 
 
 
 
 
223
 
224
+ gr.Info("Reading story", duration=3)
225
+
226
+ for sentence in model_input:
227
 
228
+ prompt = tokenizer(sentence, return_tensors="pt").to(device)
229
 
230
+ generation_kwargs = dict(
231
+ input_ids=inputs.input_ids,
232
+ prompt_input_ids=prompt.input_ids,
233
+ streamer=streamer,
234
+ do_sample=True,
235
+ temperature=1.0,
236
+ min_new_tokens=10,
237
+ )
238
+
239
+ set_seed(SEED)
240
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
241
+ thread.start()
242
+
243
+ for new_audio in streamer:
244
+ print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
245
+ yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
246
 
247
 
248
  with gr.Blocks() as block: