DGSpitzer committed
Commit c170719
1 Parent(s): f25290d

Update app.py

Files changed (1): app.py (+49, -19)
app.py CHANGED
@@ -24,6 +24,7 @@ import mutagen
 from mutagen.mp3 import MP3
 
 img_to_text = gr.Blocks.load(name="spaces/pharma/CLIP-Interrogator")
+text_to_music = gr.Interface.load("spaces/fffiloni/text-2-music")
 
 language_translation_model = hub.Module(name='baidu_translate')
 language_recognition_model = hub.Module(name='baidu_language_recognition')
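Note: in Gradio 3.x, `gr.Blocks.load` and `gr.Interface.load` return callable proxies for the remote Spaces, so the new `text_to_music` handle can be queried like a local function. A minimal usage sketch mirroring how this commit calls it (the prompt string here is illustrative only):

    # Gradio 3.x: load the remote Space as a callable proxy.
    text_to_music = gr.Interface.load("spaces/fffiloni/text-2-music")
    # fn_index=0 selects the Space's first prediction endpoint; the call
    # returns a URL/path to the generated audio, as used in get_music below.
    result = text_to_music("calm piano melody", fn_index=0)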
@@ -71,7 +72,7 @@ def translate_language(text_prompts):
 
 
 
-def get_result(text_prompts, style_indx):
+def get_result(text_prompts, style_indx, musicAI_indx):
     style = style_list[style_indx]
     prompt = style + "," + text_prompts
 
@@ -90,25 +91,53 @@ def get_result(text_prompts, style_indx):
 
     interrogate_prompt = img_to_text(imagefile, "ViT-L (best for Stable Diffusion 1.*)", "fast", fn_index=1)[0]
     print(interrogate_prompt)
-    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx])
+    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx], musicAI_indx)
 
     video_merged = merge_video(music_output, image_output)
     return {spec_result:spec_image, video_result:video_merged, status_text:'Success'}
 
-def get_music(prompt):
-    spec = pipe2(prompt).images[0]
-    print(spec)
-    wav = wav_bytes_from_spectrogram_image(spec)
-    with open("output.wav", "wb") as f:
-        f.write(wav[0].getbuffer())
-    return spec, 'output.wav'
-
+
+def get_music(prompt, musicAI_indx):
+    if musicAI_indx == 0:
+        spec = pipe2(prompt).images[0]
+        print(spec)
+        wav = wav_bytes_from_spectrogram_image(spec)
+        with open("output.wav", "wb") as f:
+            f.write(wav[0].getbuffer())
+
+
+        #Convert to mp3
+        mp3file_name = "audio.mp3"
+        #wavfile = AudioSegment.from_wav(wavfilename)
+        wav.export(mp3file_name, format="mp3")
+        return spec, mp3file_name
+    else:
+        result = text_to_music(prompt, fn_index=0)
+
+        print(f"""—————
+        NEW RESULTS
+        prompt : {prompt}
+        music : {result}
+        ———————
+        """)
+
+        url = result
+        mp3file_name = "file.mp3"
+
+        data = urllib.request.urlopen(url)
+
+        f = open(mp3file_name,'wb')
+        f.write(data.read())
+        f.close()
+
+        #wave_file="file.wav"
+
+        #sound = AudioSegment.from_mp3(save_as)
+        #sound.export(wave_file, format="wav")
+        return None, mp3file_name
+
 
-def merge_video(music, image):
-    #Convert to mp3
-    mp3file_name = "audio.mp3"
-    wavfile = AudioSegment.from_wav(music)
-    wavfile.export(mp3file_name, format="mp3")
+def merge_video(mp3file_name, image):
     print('wav audio converted to mp3 audio' )
     print('now getting duration of this mp3 audio' )
     #getting audio clip's duration
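Note on the Riffusion branch above: `wav_bytes_from_spectrogram_image` returns a `(BytesIO, duration)` tuple (hence `wav[0].getbuffer()`), so the newly added `wav.export(mp3file_name, format="mp3")` would raise `AttributeError` on a tuple; the commented-out `AudioSegment.from_wav` line hints at the intended conversion. A sketch of that step, assuming pydub's `AudioSegment` (which the file already uses) and ffmpeg on PATH:

    from pydub import AudioSegment

    # Read back the WAV written a few lines earlier, then export it as MP3.
    wavfile = AudioSegment.from_wav("output.wav")
    wavfile.export("audio.mp3", format="mp3")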
@@ -252,7 +281,7 @@ examples = [
         '概念艺术(Conceptual Art)'
     ],
     [
-        '嫦娥在时代广场,戏曲',
+        '少女在时代广场,舞蹈',
         '写实风格(Realistic style)'
    ],
    [
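(For non-Chinese readers: the example prompt changes from '嫦娥在时代广场,戏曲', roughly "Chang'e in Times Square, Chinese opera", to '少女在时代广场,舞蹈', roughly "a girl in Times Square, dancing". The string literals themselves are kept as-is since they are code.)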
@@ -395,7 +424,7 @@ with block:
                 <h1 style="font-weight: 900; margin-bottom: 7px;">Text to Image to Music to Video</h1>
               </div>
               <p style="margin-bottom: 10px; font-size: 94%">
-                Powered by <a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion Model V1</a>, <a href="https://huggingface.co/spaces/runwayml/stable-diffusion-v1-5" target="_blank">Stable Diffusion V1.5</a>, <a href="https://huggingface.co/spaces/pharma/CLIP-Interrogator" target="_blank">CLIP Interrogator</a>, fffiloni's <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">Riffusion Text-to-Music</a> and Baidu Language Translation projects
+                Powered by <a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion Model V1</a>, <a href="https://huggingface.co/spaces/Mubert/Text-to-Music" target="_blank">Mubert AI</a>, <a href="https://huggingface.co/spaces/runwayml/stable-diffusion-v1-5" target="_blank">Stable Diffusion V1.5</a>, <a href="https://huggingface.co/spaces/pharma/CLIP-Interrogator" target="_blank">CLIP Interrogator</a>, fffiloni's <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">Riffusion Text-to-Music</a> and Baidu Language Translation projects
              </p>
            </div>
        """
@@ -423,6 +452,7 @@ with block:
                 '卡通(Cartoon)', '二次元(Anime)', '浮世绘(Ukiyoe)', '蒸汽波艺术(Vaporwave)', 'low poly',
                 '像素风格(Pixel Style)', '概念艺术(Conceptual Art)', '未来主义(Futurism)', '赛博朋克(Cyberpunk)', '写实风格(Realistic style)',
                 '洛丽塔风格(Lolita style)', '巴洛克风格(Baroque style)', '超现实主义(Surrealism)', '默认(Default)'], value='默认(Default)', type="index")
+            musicAI = gr.Dropdown(label="音乐生成技术(AI Music Generator)", choices=['Riffusion', 'Mubert AI'], value='Riffusion', type="index")
             status_text = gr.Textbox(
                 label="处理状态(Process status)",
                 show_label=True,
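Because both dropdowns are declared with `type="index"`, the callback receives the selected option's position rather than its label string; that integer is what the new `get_music` branches on. A minimal sketch of the mapping, assuming the choice list keeps this order:

    # With type="index", gr.Dropdown passes the option's position:
    # 'Riffusion' -> 0 (local pipe2 branch), 'Mubert AI' -> 1 (remote Space).
    musicAI_choices = ['Riffusion', 'Mubert AI']
    assert musicAI_choices.index('Riffusion') == 0  # musicAI_indx == 0
    assert musicAI_choices.index('Mubert AI') == 1  # else branch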
@@ -430,8 +460,8 @@ with block:
                 interactive=False
             )
 
-            spec_result = gr.Image()
             video_result = gr.Video(type=None, label='Final Merged video')
+            spec_result = gr.Image()
 
             trigger_component = gr.Textbox(vaule="", visible=False) # This component is used for triggering inference funtion.
             translated_language = gr.Textbox(vaule="", visible=False)
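A pre-existing nit visible in the context lines: `vaule=""` is presumably a typo for `value=""` (and "funtion" for "function" in the comment); `vaule` is not a recognized `gr.Textbox` parameter. The likely intended declarations:

    trigger_component = gr.Textbox(value="", visible=False)  # used only to trigger the inference function
    translated_language = gr.Textbox(value="", visible=False)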
@@ -443,7 +473,7 @@ with block:
 
     text.submit(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
     btn.click(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
-    trigger_component.change(fn=get_result, inputs=[translated_language, styles], outputs=[spec_result, video_result, status_text])
+    trigger_component.change(fn=get_result, inputs=[translated_language, styles, musicAI], outputs=[spec_result, video_result, status_text])
 
 
     gr.Markdown(
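Control flow after this commit: `text.submit` and `btn.click` run `translate_language`, which writes into the hidden `trigger_component`; its `change` event then fires `get_result` with the translated prompt, the style index, and the new music-generator index, and `get_result` returns a dict keyed by the output components. Condensed sketch of the chain, with names as in the file:

    # Chained events: translation first, then generation via the hidden trigger.
    btn.click(translate_language,
              inputs=[text],
              outputs=[language_tips_text, status_text, trigger_component, translated_language])
    trigger_component.change(fn=get_result,
                             inputs=[translated_language, styles, musicAI],
                             outputs=[spec_result, video_result, status_text])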
 