Update app.py

app.py CHANGED
@@ -24,6 +24,7 @@ import mutagen
 from mutagen.mp3 import MP3
 
 img_to_text = gr.Blocks.load(name="spaces/pharma/CLIP-Interrogator")
+text_to_music = gr.Interface.load("spaces/fffiloni/text-2-music")
 
 language_translation_model = hub.Module(name='baidu_translate')
 language_recognition_model = hub.Module(name='baidu_language_recognition')
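
Both upstream models are consumed as remote Gradio apps rather than local pipelines. The sketch below shows how such a loaded Space behaves as a plain Python callable, assuming the gradio 3.x loading API used in this commit (later releases fold both helpers into gr.load); the prompt string is purely illustrative:

import gradio as gr

# gradio 3.x exposes a hosted Space as a Python callable.
text_to_music = gr.Interface.load("spaces/fffiloni/text-2-music")

# fn_index selects which endpoint of the remote app to call, matching the
# text_to_music(prompt, fn_index=0) usage in get_music below; the Mubert
# Space answers with a URL to a generated mp3.
music_url = text_to_music("calm piano over rain sounds", fn_index=0)
print(music_url)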
@@ -71,7 +72,7 @@ def translate_language(text_prompts):
 
 
 
-def get_result(text_prompts, style_indx):
+def get_result(text_prompts, style_indx, musicAI_indx):
     style = style_list[style_indx]
     prompt = style + "," + text_prompts
 
@@ -90,25 +91,53 @@ def get_result(text_prompts, style_indx):
 
     interrogate_prompt = img_to_text(imagefile, "ViT-L (best for Stable Diffusion 1.*)", "fast", fn_index=1)[0]
     print(interrogate_prompt)
-    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx])
+    spec_image, music_output = get_music(interrogate_prompt + ", " + style_list_EN[style_indx], musicAI_indx)
 
     video_merged = merge_video(music_output, image_output)
     return {spec_result:spec_image, video_result:video_merged, status_text:'Success'}
 
-
-
-
-
-
-
-
-
+
+def get_music(prompt, musicAI_indx):
+    if musicAI_indx == 0:
+        spec = pipe2(prompt).images[0]
+        print(spec)
+        wav = wav_bytes_from_spectrogram_image(spec)
+        with open("output.wav", "wb") as f:
+            f.write(wav[0].getbuffer())
+
+
+        #Convert to mp3
+        mp3file_name = "audio.mp3"
+        #wavfile = AudioSegment.from_wav(wavfilename)
+        wav.export(mp3file_name, format="mp3")
+        return spec, mp3file_name
+    else:
+        result = text_to_music(prompt, fn_index=0)
+
+        print(f"""—————
+        NEW RESULTS
+        prompt : {prompt}
+        music : {result}
+        ———————
+        """)
+
+        url = result
+        mp3file_name = "file.mp3"
+
+        data = urllib.request.urlopen(url)
+
+        f = open(mp3file_name,'wb')
+        f.write(data.read())
+        f.close()
+
+        #wave_file="file.wav"
+
+        #sound = AudioSegment.from_mp3(save_as)
+        #sound.export(wave_file, format="wav")
+        return None, mp3file_name
+
 
-def merge_video(
-    #Convert to mp3
-    mp3file_name = "audio.mp3"
-    wavfile = AudioSegment.from_wav(music)
-    wavfile.export(mp3file_name, format="mp3")
+def merge_video(mp3file_name, image):
     print('wav audio converted to mp3 audio' )
     print('now getting duration of this mp3 audio' )
     #getting audio clip's duration
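
The new get_music branches on the dropdown index: 0 renders a Riffusion spectrogram with pipe2 and decodes it to audio, while anything else asks the Mubert-backed Space for a finished mp3 URL. One caveat: wav_bytes_from_spectrogram_image returns an (io.BytesIO, duration) pair, which the hunk itself indexes as wav[0], so the later wav.export(mp3file_name, format="mp3") call cannot work on that tuple. Below is a corrected sketch of the same flow, assuming pydub handles the mp3 encoding and reusing pipe2, wav_bytes_from_spectrogram_image and text_to_music as defined elsewhere in app.py:

import urllib.request

from pydub import AudioSegment  # mp3 export requires ffmpeg on the PATH

def get_music(prompt, musicAI_indx):
    if musicAI_indx == 0:
        # Riffusion: diffuse a spectrogram image, then decode it to audio.
        spec = pipe2(prompt).images[0]
        wav_bytes, _duration_s = wav_bytes_from_spectrogram_image(spec)
        with open("output.wav", "wb") as f:
            f.write(wav_bytes.getbuffer())
        # Re-encode the wav as mp3 with pydub instead of calling .export
        # on the (BytesIO, duration) tuple.
        AudioSegment.from_wav("output.wav").export("audio.mp3", format="mp3")
        return spec, "audio.mp3"
    else:
        # Mubert: the remote Space returns a URL pointing at a finished mp3.
        url = text_to_music(prompt, fn_index=0)
        with urllib.request.urlopen(url) as data, open("file.mp3", "wb") as f:
            f.write(data.read())
        return None, "file.mp3"

merge_video then presumably reads the clip length back via mutagen's MP3(mp3file_name).info.length, which is what the mutagen imports at the top of the file and the duration prints in the context lines suggest.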
@@ -252,7 +281,7 @@ examples = [
         '概念艺术(Conceptual Art)'
     ],
     [
-        '
+        '少女在时代广场,舞蹈',
         '写实风格(Realistic style)'
     ],
     [
@@ -395,7 +424,7 @@ with block:
                 <h1 style="font-weight: 900; margin-bottom: 7px;">Text to Image to Music to Video</h1>
               </div>
               <p style="margin-bottom: 10px; font-size: 94%">
-                Powered by <a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion Model V1</a>, <a href="https://huggingface.co/spaces/runwayml/stable-diffusion-v1-5" target="_blank">Stable Diffusion V1.5</a>, <a href="https://huggingface.co/spaces/pharma/CLIP-Interrogator" target="_blank">CLIP Interrogator</a>, fffiloni's <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">Riffusion Text-to-Music</a> and Baidu Language Translation projects
+                Powered by <a href="https://huggingface.co/riffusion/riffusion-model-v1" target="_blank">Riffusion Model V1</a>, <a href="https://huggingface.co/spaces/Mubert/Text-to-Music" target="_blank">Mubert AI</a>, <a href="https://huggingface.co/spaces/runwayml/stable-diffusion-v1-5" target="_blank">Stable Diffusion V1.5</a>, <a href="https://huggingface.co/spaces/pharma/CLIP-Interrogator" target="_blank">CLIP Interrogator</a>, fffiloni's <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">Riffusion Text-to-Music</a> and Baidu Language Translation projects
               </p>
             </div>
 """
@@ -423,6 +452,7 @@ with block:
                     '卡通(Cartoon)', '二次元(Anime)', '浮世绘(Ukiyoe)', '蒸汽波艺术(Vaporwave)', 'low poly',
                     '像素风格(Pixel Style)', '概念艺术(Conceptual Art)', '未来主义(Futurism)', '赛博朋克(Cyberpunk)', '写实风格(Realistic style)',
                     '洛丽塔风格(Lolita style)', '巴洛克风格(Baroque style)', '超现实主义(Surrealism)', '默认(Default)'], value='默认(Default)', type="index")
+                musicAI = gr.Dropdown(label="音乐生成技术(AI Music Generator)", choices=['Riffusion', 'Mubert AI'], value='Riffusion', type="index")
                 status_text = gr.Textbox(
                     label="处理状态(Process status)",
                     show_label=True,
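
Like the styles dropdown above it, the new musicAI dropdown is declared with type="index", so downstream callbacks receive the position of the selection rather than its label; that is why get_music tests musicAI_indx == 0 for 'Riffusion'. A minimal standalone illustration under gradio 3.x:

import gradio as gr

with gr.Blocks() as demo:
    # type="index" hands the handler 0 or 1 instead of the label text.
    musicAI = gr.Dropdown(choices=['Riffusion', 'Mubert AI'],
                          value='Riffusion', type="index")
    picked = gr.Textbox(label="selected index")
    musicAI.change(lambda i: str(i), inputs=[musicAI], outputs=[picked])

demo.launch()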
@@ -430,8 +460,8 @@ with block:
                     interactive=False
                 )
 
-                spec_result = gr.Image()
                 video_result = gr.Video(type=None, label='Final Merged video')
+                spec_result = gr.Image()
 
                 trigger_component = gr.Textbox(vaule="", visible=False) # This component is used for triggering inference funtion.
                 translated_language = gr.Textbox(vaule="", visible=False)
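
Swapping the on-screen order of video_result and spec_result needs no change in get_result, because that function returns a dict keyed by output components and gradio matches dict keys by component rather than by position. A small sketch of the pattern, with hypothetical components:

import gradio as gr

with gr.Blocks() as demo:
    first = gr.Textbox(label="shown first")
    second = gr.Textbox(label="shown second")
    btn = gr.Button("run")
    # The dict keys, not the order of the outputs list, decide where each
    # value lands, so components can be reordered freely in the layout.
    btn.click(lambda: {second: "B", first: "A"}, inputs=None,
              outputs=[first, second])

demo.launch()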
@@ -443,7 +473,7 @@ with block:
 
     text.submit(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
     btn.click(translate_language, inputs=[text], outputs=[language_tips_text, status_text, trigger_component, translated_language])
-    trigger_component.change(fn=get_result, inputs=[translated_language, styles], outputs=[spec_result, video_result, status_text])
+    trigger_component.change(fn=get_result, inputs=[translated_language, styles, musicAI], outputs=[spec_result, video_result, status_text])
 
 
     gr.Markdown(
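
The event wiring keeps its two-stage shape: translate_language runs on submit or click and writes into the invisible trigger_component, whose change event then starts the heavy get_result step, now with the musicAI index threaded through. A minimal standalone sketch of this hidden-trigger chaining pattern, with stand-in callbacks:

import gradio as gr

with gr.Blocks() as demo:
    text = gr.Textbox(label="prompt")
    trigger = gr.Textbox(value="", visible=False)  # invisible relay
    result = gr.Textbox(label="result")

    def fast_step(prompt):      # stand-in for translate_language
        return prompt.strip()

    def slow_step(translated):  # stand-in for get_result
        return f"processed: {translated}"

    # Stage 1 fills the hidden textbox; its change event fires stage 2.
    text.submit(fast_step, inputs=[text], outputs=[trigger])
    trigger.change(slow_step, inputs=[trigger], outputs=[result])

demo.launch()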