Update app.py
app.py CHANGED
@@ -5,8 +5,8 @@ from share_btn import community_icon_html, loading_icon_html, share_js
 
 model_id="haoheliu/AudioLDM-S-Full"
 
-audioldm = build_model()
-
+audioldm = None
+current_model_name = None
 
 # def predict(input, history=[]):
 # # tokenize the new input sentence
@@ -23,10 +23,25 @@ audioldm = build_model()
 # response = [(response[i], response[i+1]) for i in range(0, len(response)-1, 2)] # convert to tuples of list
 # return response, history
 
-def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
+def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name):
+    global audioldm, current_model_name
+
+    if audioldm is None or model_name != current_model_name:
+        audioldm=build_model(model_name=model_name)
+        current_model_name = model_name
+
     # print(text, length, guidance_scale)
-    waveform = text_to_audio(
-
+    waveform = text_to_audio(
+        latent_diffusion=audioldm,
+        text=text,
+        seed=random_seed,
+        duration=duration,
+        guidance_scale=guidance_scale,
+        n_candidate_gen_per_text=int(n_candidates),
+    ) # [bs, 1, samples]
+    waveform = [
+        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
+    ]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
     if(len(waveform) == 1):
         waveform = waveform[0]
@@ -223,6 +238,9 @@ with iface:
                 duration = gr.Slider(2.5, 10, value=10, step=2.5, label="Duration (seconds)")
                 guidance_scale = gr.Slider(0, 4, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
                 n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
+                model_name = gr.Dropdown(
+                    ["audioldm-m-text-ft", "audioldm-s-text-ft", "audioldm-m-full","audioldm-s-full-v2", "audioldm-s-full", "audioldm-l-full"], value="audioldm-m-text-ft", label="Choose the model to use. audioldm-m-text-ft and audioldm-s-text-ft are recommanded. -s- means small, -m- means medium and -l- means large",
+                )
             ############# Output
             # outputs=gr.Audio(label="Output", type="numpy")
             outputs=gr.Video(label="Output", elem_id="output-video")
@@ -242,7 +260,7 @@ with iface:
         share_button = gr.Button("Share to community", elem_id="share-btn")
 
     btn.click(text2audio, inputs=[
-        textbox, duration, guidance_scale, seed, n_candidates], outputs=[outputs])
+        textbox, duration, guidance_scale, seed, n_candidates, model_name], outputs=[outputs])
 
     share_button.click(None, [], [], _js=share_js)
     gr.HTML('''
@@ -255,14 +273,14 @@ with iface:
         </div>
    ''')
    gr.Examples([
-        ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3],
-        ["Peaceful and calming ambient music with singing bowl and other instruments.", 5, 2.5, 45, 3],
-        ["A man is speaking in a small room.", 5, 2.5, 45, 3],
-        ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3],
-        ["Wooden table tapping sound followed by water pouring sound.", 5, 2.5, 45, 3],
+        ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["Peaceful and calming ambient music with singing bowl and other instruments.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["A man is speaking in a small room.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["Wooden table tapping sound followed by water pouring sound.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
    ],
    fn=text2audio,
-    inputs=[textbox, duration, guidance_scale, seed, n_candidates],
+    inputs=[textbox, duration, guidance_scale, seed, n_candidates, model_name],
    outputs=[outputs],
    cache_examples=True,
    )
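For context, the caching logic this commit adds to text2audio can be read as a small standalone pattern: keep the loaded model and its name in module-level globals, and rebuild only when the requested checkpoint name changes, so switching the dropdown swaps models while repeated requests reuse the one already in memory. The sketch below is illustrative only and makes no claims about the space's internals; load_checkpoint is a hypothetical stand-in for build_model(model_name=...).

# Minimal sketch of a lazy, name-keyed model cache (assumption-labeled, not the space's code).

_model = None
_model_name = None

def load_checkpoint(name):
    # Hypothetical placeholder for an expensive loader such as build_model(model_name=name).
    return {"name": name}

def get_model(name):
    """Return a cached model, rebuilding it only when the requested name changes."""
    global _model, _model_name
    if _model is None or name != _model_name:
        _model = load_checkpoint(name)
        _model_name = name
    return _model

# Repeated calls with the same name reuse the cached object; a new name triggers a rebuild.
m1 = get_model("audioldm-m-text-ft")
m2 = get_model("audioldm-m-text-ft")
assert m1 is m2
m3 = get_model("audioldm-s-full")
assert m3 is not m1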