haoheliu committed
Commit b700c02
1 Parent(s): f826ca1

Update app.py

Files changed (1)
  app.py  +30 -12
app.py CHANGED
@@ -5,8 +5,8 @@ from share_btn import community_icon_html, loading_icon_html, share_js
 
 model_id="haoheliu/AudioLDM-S-Full"
 
-audioldm = build_model()
-# audioldm=None
+audioldm = None
+current_model_name = None
 
 # def predict(input, history=[]):
 #     # tokenize the new input sentence
@@ -23,10 +23,25 @@ audioldm = build_model()
 #     response = [(response[i], response[i+1]) for i in range(0, len(response)-1, 2)] # convert to tuples of list
 #     return response, history
 
-def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
+def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name):
+    global audioldm, current_model_name
+
+    if audioldm is None or model_name != current_model_name:
+        audioldm=build_model(model_name=model_name)
+        current_model_name = model_name
+
     # print(text, length, guidance_scale)
-    waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=int(n_candidates)) # [bs, 1, samples]
-    waveform = [gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform]
+    waveform = text_to_audio(
+        latent_diffusion=audioldm,
+        text=text,
+        seed=random_seed,
+        duration=duration,
+        guidance_scale=guidance_scale,
+        n_candidate_gen_per_text=int(n_candidates),
+    ) # [bs, 1, samples]
+    waveform = [
+        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
+    ]
     # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
     if(len(waveform) == 1):
         waveform = waveform[0]
@@ -223,6 +238,9 @@ with iface:
         duration = gr.Slider(2.5, 10, value=10, step=2.5, label="Duration (seconds)")
         guidance_scale = gr.Slider(0, 4, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
         n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
+        model_name = gr.Dropdown(
+            ["audioldm-m-text-ft", "audioldm-s-text-ft", "audioldm-m-full","audioldm-s-full-v2", "audioldm-s-full", "audioldm-l-full"], value="audioldm-m-text-ft", label="Choose the model to use. audioldm-m-text-ft and audioldm-s-text-ft are recommanded. -s- means small, -m- means medium and -l- means large",
+        )
         ############# Output
         # outputs=gr.Audio(label="Output", type="numpy")
         outputs=gr.Video(label="Output", elem_id="output-video")
@@ -242,7 +260,7 @@ with iface:
         share_button = gr.Button("Share to community", elem_id="share-btn")
 
     btn.click(text2audio, inputs=[
-        textbox, duration, guidance_scale, seed, n_candidates], outputs=[outputs])
+        textbox, duration, guidance_scale, seed, n_candidates, model_name], outputs=[outputs])
 
     share_button.click(None, [], [], _js=share_js)
     gr.HTML('''
@@ -255,14 +273,14 @@ with iface:
         </div>
         ''')
     gr.Examples([
-        ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3],
-        ["Peaceful and calming ambient music with singing bowl and other instruments.", 5, 2.5, 45, 3],
-        ["A man is speaking in a small room.", 5, 2.5, 45, 3],
-        ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3],
-        ["Wooden table tapping sound followed by water pouring sound.", 5, 2.5, 45, 3],
+        ["A hammer is hitting a wooden surface", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["Peaceful and calming ambient music with singing bowl and other instruments.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["A man is speaking in a small room.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["A female is speaking followed by footstep sound", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
+        ["Wooden table tapping sound followed by water pouring sound.", 5, 2.5, 45, 3, "audioldm-m-text-ft"],
         ],
         fn=text2audio,
-        inputs=[textbox, duration, guidance_scale, seed, n_candidates],
+        inputs=[textbox, duration, guidance_scale, seed, n_candidates, model_name],
         outputs=[outputs],
         cache_examples=True,
     )
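The core change above is that the checkpoint is no longer built eagerly at import time: `text2audio` now builds the model lazily and caches it under `current_model_name`, so the new `model_name` dropdown only triggers a reload when the selection actually changes. The sketch below is a minimal, standalone illustration of that pattern, assuming `build_model` and `text_to_audio` accept the keyword arguments shown in the diff; the import path, prompt, output filename, and the WAV-saving step are illustrative assumptions, not part of the commit.

```python
# Minimal sketch of the "build the model once per checkpoint name" pattern
# introduced in text2audio(). Assumes build_model()/text_to_audio() accept the
# keyword arguments used in app.py above; import path, prompt, and file output
# are illustrative only.
import numpy as np
from scipy.io import wavfile  # assumed available, only used to save the result

from audioldm import build_model, text_to_audio  # helpers app.py relies on

audioldm = None
current_model_name = None


def generate(text, model_name="audioldm-m-text-ft", duration=5.0,
             guidance_scale=2.5, seed=45, n_candidates=3):
    """Rebuild the model only when the requested checkpoint changes."""
    global audioldm, current_model_name
    if audioldm is None or model_name != current_model_name:
        audioldm = build_model(model_name=model_name)
        current_model_name = model_name
    return text_to_audio(
        latent_diffusion=audioldm,
        text=text,
        seed=seed,
        duration=duration,
        guidance_scale=guidance_scale,
        n_candidate_gen_per_text=int(n_candidates),
    )  # [bs, 1, samples], per the comment in app.py


if __name__ == "__main__":
    waveform = generate("A hammer is hitting a wooden surface")
    # app.py renders each candidate at 16 kHz via gr.make_waveform((16000, wave[0])),
    # so the same rate is assumed here when writing the first candidate to disk.
    samples = np.asarray(waveform[0]).squeeze()
    wavfile.write("hammer.wav", 16000, (samples * 32767.0).astype(np.int16))
```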