import os

import gradio as gr
from gradio_client import Client
from pydub import AudioSegment

# Hugging Face access token taken from the environment (None when HF_TOKEN is unset).
hf_token = os.getenv("HF_TOKEN")

# LP-Music-Caps Space, loaded through Gradio so it can be called like a function.
lpmc_client = gr.load(
    "fffiloni/LP-Music-Caps-demo",
    src="spaces",
    hf_token=hf_token,
)

# Llama Space that receives the caption-to-image-description prompt.
client = Client(
    "https://fffiloni-test-llama-api-debug.hf.space/",
    hf_token=hf_token,
)

# Zeroscope Space that receives the final text prompt and returns a video.
zrscp_client = Client(
    "https://fffiloni-zeroscope.hf.space/",
    hf_token=hf_token,
)
def cut_audio(input_path, output_path, max_duration=30000):
    """Export an MP3 copy of an audio file, keeping at most ``max_duration``.

    Args:
        input_path: Audio file to load with ``AudioSegment.from_file``.
        output_path: Where the MP3 export is written.
        max_duration: Upper bound on the kept length, in the unit used by
            ``len()`` of the loaded segment (milliseconds for pydub, so the
            default keeps the first 30 seconds).

    Returns:
        ``output_path``, so the call can be used inline.
    """
    segment = AudioSegment.from_file(input_path)

    # Only slice when the clip is longer than the cap; shorter clips are
    # exported as loaded.
    kept = segment[:max_duration] if len(segment) > max_duration else segment

    kept.export(output_path, format="mp3")
    return output_path
def solo_zrscp(prompt):
    """Ask the Zeroscope Space for a video matching ``prompt``.

    Args:
        prompt: Text sent to the Space's ``/zrscp`` endpoint.

    Returns:
        Whatever the endpoint returns; the UI routes it to the video output.
    """
    return zrscp_client.predict(prompt, api_name="/zrscp")
def infer(audio_file):
    """Turn an uploaded piece of music into a Zeroscope video.

    The audio is trimmed with ``cut_audio`` and captioned by the
    LP-Music-Caps Space. The caption is embedded in a prompt asking the Llama
    Space for a matching image description, and that description is then
    sent to the Zeroscope Space through ``solo_zrscp``.

    Args:
        audio_file: Path of the audio file coming from the ``gr.Audio`` input.

    Returns:
        A tuple ``(video, image_description, button_update)`` in the order of
        the ``[vid_result, llama_trans_cap, tryagain_btn]`` outputs;
        ``button_update`` makes the "Try again ?" button visible.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Guard against a click without an upload: fail with a readable message
    # instead of letting the audio loader choke on an empty path.
    if not audio_file:
        raise gr.Error("Please upload an audio file before generating a video.")

    # NOTE(review): the trimmed copy always goes to the same file in the
    # working directory; overlapping runs would overwrite each other's audio.
    # Confirm the queue processes one job at a time.
    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")

    cap_result = lpmc_client(truncated_audio, api_name="predict")
    print(cap_result)

    # Prompt sent to the Llama Space. Built line by line so the text is
    # independent of this function's indentation; it starts and ends with a
    # newline.
    llama_q = "\n".join(
        (
            "",
            "I'll give you a music description.",
            "Give me an image description that would fit well with the music description.",
            "Be creative, do not do list, just an image description as required. Try to think about human characters first.",
            "Your image description must fit well for a stable diffusion prompt.",
            "",
            "Here's the music description :",
            "",
            f"« {cap_result} »",
            "",
        )
    )

    # "M2I" is the second input of the Space's /predict endpoint
    # (presumably a mode selector; confirm against the Space's API).
    result = client.predict(llama_q, "M2I", api_name="/predict")
    print(f"Llama2 result: {result}")

    # Same Zeroscope call as the "Try again ?" button uses.
    res_vid = solo_zrscp(result)
    print("Finished")

    return res_vid, result, gr.update(visible=True)
# Centre the main column and cap its width.
css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""

# Page layout: header, audio input, trigger button, outputs and examples.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Music To Zeroscope Video
                </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 94%">
                Sends an audio into <a href="https://huggingface.co/spaces/seungheondoh/LP-Music-Caps-demo" target="_blank">LP-Music-Caps</a>
                to generate an audio caption which is then translated to an illustrative image description with Llama2, and finally run through
                Zeroscope to generate a 3s video from the audio ! <br /><br />
                Note: Only the first 30 seconds of your audio will be used for inference.
            </p>
        </div>""")

        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
        infer_btn = gr.Button("Generate Video from Music")

        # Hidden textbox: stores the generated description so the
        # "Try again ?" button can resend it to Zeroscope.
        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
        # The output is a video, so the label says so.
        vid_result = gr.Video(label="Video Result")
        # Hidden at first; infer() returns an update that makes it visible.
        tryagain_btn = gr.Button("Try again ?", visible=False)

        gr.Examples(
            examples=[
                ["./examples/electronic.mp3"],
                ["./examples/folk.wav"],
                ["./examples/orchestra.wav"],
            ],
            fn=infer,
            inputs=[audio_input],
            outputs=[vid_result, llama_trans_cap, tryagain_btn],
            cache_examples=True,
        )

    # Full pipeline: audio -> caption -> image description -> video.
    infer_btn.click(
        fn=infer,
        inputs=[audio_input],
        outputs=[vid_result, llama_trans_cap, tryagain_btn],
    )
    # Retry only the video step, reusing the stored description.
    tryagain_btn.click(
        fn=solo_zrscp,
        inputs=[llama_trans_cap],
        outputs=[vid_result],
    )

# Queue incoming requests (at most 20 waiting) and start the app.
demo.queue(max_size=20).launch()