freddyaboulton HF staff committed on
Commit
82022e9
·
1 Parent(s): ee3a553
Files changed (1) hide show
  1. app.py +46 -15
app.py CHANGED
@@ -171,7 +171,7 @@ def numpy_to_mp3(audio_array, sampling_rate):
171
  # Normalize audio_array if it's floating-point
172
  if np.issubdtype(audio_array.dtype, np.floating):
173
  max_val = np.max(np.abs(audio_array))
174
- audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
175
  audio_array = audio_array.astype(np.int16)
176
 
177
  # Create an audio segment from the numpy array
@@ -196,9 +196,10 @@ sampling_rate = model.audio_encoder.config.sampling_rate
196
  frame_rate = model.audio_encoder.config.frame_rate
197
 
198
  import random
 
199
 
200
  @spaces.GPU
201
- def generate_base(subject, setting, ):
202
 
203
  messages = [{"role": "system", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories."
204
  "You want to write a bed time story for your child. They will give you the subject and setting "
@@ -209,34 +210,63 @@ def generate_base(subject, setting, ):
209
  response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
210
  gr.Info("Story Generated", duration=3)
211
  story = response.choices[0].message.content
212
- yield story, None
213
 
214
  model_input = story.replace("\n", " ").strip()
215
- model_input = nltk.sent_tokenize(model_input)
216
 
217
  play_steps_in_s = 4.0
218
  play_steps = int(frame_rate * play_steps_in_s)
219
 
 
220
  description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
221
- description = [description for _ in range(len(model_input))]
222
- description_tokens = tokenizer(description, return_tensors="pt").input_ids.to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
 
224
  # for i in range(0, len(model_input), BATCH_SIZE):
225
  # inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  # if len(inputs) != 0:
228
  # input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
229
- story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
230
 
231
- speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
232
 
233
- speech_output = [output.cpu().numpy() for output in speech_output]
234
 
235
- for i, new_audio in enumerate(speech_output):
236
- if i == 0:
237
- gr.Info("Reading story", duration=3)
238
- print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
239
- yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
240
 
241
  # print(f"{i}-th part generated")
242
  # pieces += [*speech_output, silence.copy()]
@@ -286,7 +316,8 @@ with gr.Blocks() as block:
286
 
287
  inputs = [subject, setting]
288
  outputs = [story, audio_out]
289
- run_button.click(fn=generate_base, inputs=inputs, outputs=outputs)
 
290
 
291
  block.queue()
292
  block.launch(share=True)
 
171
  # Normalize audio_array if it's floating-point
172
  if np.issubdtype(audio_array.dtype, np.floating):
173
  max_val = np.max(np.abs(audio_array))
174
+ audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
175
  audio_array = audio_array.astype(np.int16)
176
 
177
  # Create an audio segment from the numpy array
 
196
  frame_rate = model.audio_encoder.config.frame_rate
197
 
198
  import random
199
+ import datetime
200
 
201
  @spaces.GPU
202
+ def generate_base(subject, setting):
203
 
204
  messages = [{"role": "system", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories."
205
  "You want to write a bed time story for your child. They will give you the subject and setting "
 
210
  response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
211
  gr.Info("Story Generated", duration=3)
212
  story = response.choices[0].message.content
 
213
 
214
  model_input = story.replace("\n", " ").strip()
215
+ model_input_tokens = nltk.sent_tokenize(model_input)
216
 
217
  play_steps_in_s = 4.0
218
  play_steps = int(frame_rate * play_steps_in_s)
219
 
220
+ gr.Info("Generating Audio")
221
  description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
222
+ story_tokens = tokenizer(model_input_tokens, return_tensors="pt", padding=True).input_ids.to(device)
223
+ description_tokens = tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").input_ids.to(device)
224
+ speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
225
+ speech_output = [output.cpu().numpy() for output in speech_output]
226
+ gr.Info("Generated Audio")
227
+ return None, None, {"audio": speech_output, "text": model_input_tokens}
228
+
229
+ def stream_audio(state):
230
+ speech_output = state["audio"]
231
+ sentences = state["text"]
232
+
233
+ gr.Info("Reading Story")
234
+
235
+ story = ""
236
+ for sentence, new_audio in zip(sentences, speech_output):
237
+ # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
238
+ print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
239
+ story += f"{sentence}\n"
240
+ yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
241
 
242
+ # BATCH_SIZE = 4
243
  # for i in range(0, len(model_input), BATCH_SIZE):
244
  # inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
245
+ # story_tokens = tokenizer(inputs, return_tensors="pt", padding=True).input_ids.to(device)
246
+ # description_tokens = tokenizer([description for _ in range(len(inputs))], return_tensors="pt").input_ids.to(device)
247
+ # speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
248
+
249
+ # speech_output = [output.cpu().numpy() for output in speech_output]
250
+ # for j, new_audio in enumerate(speech_output):
251
+ # if i + j == 0:
252
+ # gr.Info("Reading story", duration=3)
253
+ # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
254
+ # print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
255
+ # yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
256
 
257
  # if len(inputs) != 0:
258
  # input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
259
+ # story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
260
 
261
+ # speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
262
 
263
+ # speech_output = [output.cpu().numpy() for output in speech_output]
264
 
265
+ # for i, new_audio in enumerate(speech_output):
266
+ # if i == 0:
267
+ # gr.Info("Reading story", duration=3)
268
+ # print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
269
+ # yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
270
 
271
  # print(f"{i}-th part generated")
272
  # pieces += [*speech_output, silence.copy()]
 
316
 
317
  inputs = [subject, setting]
318
  outputs = [story, audio_out]
319
+ state = gr.State()
320
+ run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
321
 
322
  block.queue()
323
  block.launch(share=True)