Commit 82022e9 · Parent: ee3a553 · "Add"

app.py CHANGED
@@ -171,7 +171,7 @@ def numpy_to_mp3(audio_array, sampling_rate):
     # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
         max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767
+        audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
         audio_array = audio_array.astype(np.int16)
 
     # Create an audio segment from the numpy array
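Note: only the normalization lines of numpy_to_mp3 are visible in this hunk. For context, a minimal sketch of what the full helper might look like, assuming pydub's AudioSegment is the "audio segment" the trailing comment refers to (the import and the MP3 export details are not shown in this diff):

import io

import numpy as np
from pydub import AudioSegment  # assumption: pydub (backed by ffmpeg) does the MP3 encoding

def numpy_to_mp3(audio_array, sampling_rate):
    # Scale floating-point samples from [-1.0, 1.0] up to the 16-bit integer range
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
        audio_array = audio_array.astype(np.int16)

    # Wrap the raw PCM bytes in an AudioSegment and encode to MP3 in memory
    segment = AudioSegment(
        data=audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,  # 2 bytes per int16 sample
        channels=1,
    )
    buffer = io.BytesIO()
    segment.export(buffer, format="mp3")
    return buffer.getvalue()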
@@ -196,9 +196,10 @@ sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
 import random
+import datetime
 
 @spaces.GPU
-def generate_base(subject, setting, ):
+def generate_base(subject, setting):
 
     messages = [{"role": "system", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories. "
                                                "You want to write a bedtime story for your child. They will give you the subject and setting "
@@ -209,34 +210,63 @@ def generate_base(subject, setting, ):
     response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
     gr.Info("Story Generated", duration=3)
     story = response.choices[0].message.content
-    yield story, None
 
     model_input = story.replace("\n", " ").strip()
-
+    model_input_tokens = nltk.sent_tokenize(model_input)
 
     play_steps_in_s = 4.0
     play_steps = int(frame_rate * play_steps_in_s)
 
+    gr.Info("Generating Audio")
     description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
-
-    description_tokens = tokenizer(description, return_tensors="pt").input_ids.to(device)
+    story_tokens = tokenizer(model_input_tokens, return_tensors="pt", padding=True).input_ids.to(device)
+    description_tokens = tokenizer([description for _ in range(len(model_input_tokens))], return_tensors="pt").input_ids.to(device)
+    speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
+    speech_output = [output.cpu().numpy() for output in speech_output]
+    gr.Info("Generated Audio")
+    return None, None, {"audio": speech_output, "text": model_input_tokens}
+
+def stream_audio(state):
+    speech_output = state["audio"]
+    sentences = state["text"]
+
+    gr.Info("Reading Story")
+
+    story = ""
+    for sentence, new_audio in zip(sentences, speech_output):
+        # print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
+        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+        story += f"{sentence}\n"
+        yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
+    # BATCH_SIZE = 4
     # for i in range(0, len(model_input), BATCH_SIZE):
     #     inputs = model_input[i:min(i + BATCH_SIZE, len(model_input))]
+    #     story_tokens = tokenizer(inputs, return_tensors="pt", padding=True).input_ids.to(device)
+    #     description_tokens = tokenizer([description for _ in range(len(inputs))], return_tensors="pt").input_ids.to(device)
+    #     speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story_tokens)
+
+    #     speech_output = [output.cpu().numpy() for output in speech_output]
+    #     for j, new_audio in enumerate(speech_output):
+    #         if i + j == 0:
+    #             gr.Info("Reading story", duration=3)
+    #         print(f"i, j, time: {i}, {j} {datetime.datetime.now()}")
+    #         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+    #         yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
     # if len(inputs) != 0:
     #     input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
+    # story = tokenizer(model_input, return_tensors="pt", padding=True).input_ids.to(device)
 
-    speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
+    # speech_output = model.generate(input_ids=description_tokens, prompt_input_ids=story)
 
-    speech_output = [output.cpu().numpy() for output in speech_output]
+    # speech_output = [output.cpu().numpy() for output in speech_output]
 
-    for i, new_audio in enumerate(speech_output):
-        if i == 0:
-            gr.Info("Reading story", duration=3)
-        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-        yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
+    # for i, new_audio in enumerate(speech_output):
+    #     if i == 0:
+    #         gr.Info("Reading story", duration=3)
+    #     print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
+    #     yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
     #     print(f"{i}-th part generated")
     # pieces += [*speech_output, silence.copy()]
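The heart of this change: instead of tokenizing the whole story as one prompt, the story is split into sentences with NLTK, the sentences are padded into a single batch, and the model generates one clip per sentence, which stream_audio then yields to the UI one at a time. A small illustration of the splitting step (the nltk.download call and the sample story are assumptions for this sketch; the punkt models must be available somewhere for sent_tokenize to work):

import nltk

nltk.download("punkt", quiet=True)  # assumed: fetch the sentence-splitting models once

story = "The fox curled up under the oak. The moon rose. Soon the forest slept."
sentences = nltk.sent_tokenize(story)
print(sentences)
# ['The fox curled up under the oak.', 'The moon rose.', 'Soon the forest slept.']

Padding matters here because the sentences differ in length: padding=True makes the tokenized batch rectangular, so a single model.generate call can synthesize all sentences at once.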
@@ -286,7 +316,8 @@ with gr.Blocks() as block:
 
     inputs = [subject, setting]
    outputs = [story, audio_out]
-    run_button.click(fn=generate_base, inputs=inputs, outputs=outputs)
+    state = gr.State()
+    run_button.click(fn=generate_base, inputs=inputs, outputs=[story, audio_out, state]).success(stream_audio, inputs=state, outputs=outputs)
 
 block.queue()
 block.launch(share=True)
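The new wiring splits the work across two chained events: generate_base runs under @spaces.GPU and stashes its results in gr.State, then .success() launches the stream_audio generator, which yields to the UI sentence by sentence only if the first step finished without error. A stripped-down sketch of the same pattern with toy functions (all names here are illustrative, not from app.py):

import gradio as gr

def produce(text):
    # stand-in for generate_base: do the heavy work once, stash results in state
    return None, {"chunks": text.split()}

def stream(state):
    # stand-in for stream_audio: a generator that yields progressively from state
    shown = ""
    for chunk in state["chunks"]:
        shown += chunk + " "
        yield shown

with gr.Blocks() as demo:
    inp = gr.Textbox(label="input")
    out = gr.Textbox(label="output")
    run = gr.Button("Run")
    state = gr.State()
    # .success() only fires if the first event finished without raising
    run.click(produce, inputs=inp, outputs=[out, state]).success(stream, inputs=state, outputs=out)

demo.queue()
demo.launch()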
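For reference, the model.generate(input_ids=..., prompt_input_ids=...) call follows the Parler-TTS convention: the voice description is conditioned through input_ids and the text to be spoken through prompt_input_ids. A minimal single-sentence sketch, assuming the parler-tts package and its mini checkpoint (neither is shown in this diff):

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

description = "Jenny speaks at an average pace with a calm delivery."
prompt = "Once upon a time, a small fox found a lantern in the woods."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# The description controls the voice; the prompt is the text that gets spoken
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio = generation.cpu().numpy().squeeze()  # 1-D waveform at model.audio_encoder.config.sampling_rate

The batched version in the diff is the same call with one description and one sentence per batch row, which is why both tensors are built from lists of equal length.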