Update app.py
app.py CHANGED
@@ -5,8 +5,8 @@ from share_btn import community_icon_html, loading_icon_html, share_js
 
 model_id="haoheliu/AudioLDM-S-Full"
 
-audioldm = build_model()
-
+audioldm = None
+current_model_name = None
 
 # def predict(input, history=[]):
 # # tokenize the new input sentence
@@ -23,10 +23,25 @@ audioldm = build_model()
 # response = [(response[i], response[i+1]) for i in range(0, len(response)-1, 2)] # convert to tuples of list
 # return response, history
 
-def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
+def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name):
+    global audioldm, current_model_name
+
+    if audioldm is None or model_name != current_model_name:
+        audioldm=build_model(model_name=model_name)
+        current_model_name = model_name
+
     # print(text, length, guidance_scale)
-    waveform = text_to_audio(
-
+    waveform = text_to_audio(
+        latent_diffusion=audioldm,
+        text=text,
+        seed=random_seed,
+        duration=duration,
+        guidance_scale=guidance_scale,
+        n_candidate_gen_per_text=int(n_candidates),
+    ) # [bs, 1, samples]
+    waveform = [
+        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
+    ]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
     if(len(waveform) == 1):
         waveform = waveform[0]
@@ -223,6 +238,9 @@ with iface:
                 duration = gr.Slider(2.5, 10, value=10, step=2.5, label="Duration (seconds)")
                 guidance_scale = gr.Slider(0, 4, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
                 n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
+                model_name = gr.Dropdown(
+                    ["audioldm-m-text-ft", "audioldm-s-text-ft", "audioldm-m-full","audioldm-s-full-v2", "audioldm-s-full", "audioldm-l-full"], value="audioldm-m-text-ft", label="Choose the model to use. audioldm-m-text-ft and audioldm-s-text-ft are recommanded. -s- means small, -m- means medium and -l- means large",
+                )
             ############# Output
             # outputs=gr.Audio(label="Output", type="numpy")
             outputs=gr.Video(label="Output", elem_id="output-video")
@@ -242,7 +260,7 @@ with iface:
         share_button = gr.Button("Share to community", elem_id="share-btn")
 
     btn.click(text2audio, inputs=[
-        textbox, duration, guidance_scale, seed, n_candidates], outputs=[outputs])
+        textbox, duration, guidance_scale, seed, n_candidates, model_name], outputs=[outputs])
 
     share_button.click(None, [], [], _js=share_js)
     gr.HTML('''
@@ -255,14 +273,14 @@ with iface:
         </div>
    ''')
    gr.Examples([
-        ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3],
-        ["Peaceful and calming ambient music with singing bowl and other instruments.", 5, 2.5, 45, 3],
-        ["A man is speaking in a small room.", 5, 2.5, 45, 3],
-        ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3],
-        ["Wooden table tapping sound followed by water pouring sound.", 5, 2.5, 45, 3],
+        ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["Peaceful and calming ambient music with singing bowl and other instruments.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["A man is speaking in a small room.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["Wooden table tapping sound followed by water pouring sound.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
    ],
    fn=text2audio,
-    inputs=[textbox, duration, guidance_scale, seed, n_candidates],
+    inputs=[textbox, duration, guidance_scale, seed, n_candidates, model_name],
    outputs=[outputs],
    cache_examples=True,
    )
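For context, the caching logic this commit adds to text2audio can be read as a small standalone pattern: keep the loaded model and its name in module-level globals, and rebuild only when the requested checkpoint name changes, so switching the dropdown swaps models while repeated requests reuse the one already in memory. The sketch below is illustrative only and makes no claims about the space's internals; load_checkpoint is a hypothetical stand-in for build_model(model_name=...).

# Minimal sketch of a lazy, name-keyed model cache (assumption-labeled, not the space's code).

_model = None
_model_name = None

def load_checkpoint(name):
    # Hypothetical placeholder for an expensive loader such as build_model(model_name=name).
    return {"name": name}

def get_model(name):
    """Return a cached model, rebuilding it only when the requested name changes."""
    global _model, _model_name
    if _model is None or name != _model_name:
        _model = load_checkpoint(name)
        _model_name = name
    return _model

# Repeated calls with the same name reuse the cached object; a new name triggers a rebuild.
m1 = get_model("audioldm-m-text-ft")
m2 = get_model("audioldm-m-text-ft")
assert m1 is m2
m3 = get_model("audioldm-s-full")
assert m3 is not m1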