Music-To-Zeroscope

Paused

App Files Files Community

Music-To-Zeroscope / app.py

fffiloni

update llama sys prompt

428b509 over 1 year ago

raw

history blame

4.55 kB

	# Dependencies, grouped: standard library first, then third-party packages.
	import os

	import gradio as gr
	from gradio_client import Client
	from pydub import AudioSegment

	# Access token taken from the environment; None when HF_TOKEN is not set.
	hf_token = os.getenv('HF_TOKEN')

	# Music-captioning Space, loaded through Gradio so it can be called directly.
	lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")

	# Remote Spaces reached through gradio_client, both authenticated with hf_token.
	client = Client("https://fffiloni-test-llama-api-debug.hf.space/", hf_token=hf_token)
	zrscp_client = Client("https://fffiloni-zeroscope--rhhd9.hf.space/", hf_token=hf_token)

	def cut_audio(input_path, output_path, max_duration=30000):
	    """Write an MP3 copy of an audio file, capped at ``max_duration``.

	    Args:
	        input_path: Path of the audio file to read.
	        output_path: Path the MP3 export is written to.
	        max_duration: Maximum length to keep, in milliseconds (the unit
	            pydub uses for ``len()`` and slicing). Defaults to 30000.

	    Returns:
	        ``output_path``, unchanged, for convenient chaining.
	    """
	    clip = AudioSegment.from_file(input_path)

	    # Only slice when the clip is actually longer than the allowed length.
	    too_long = len(clip) > max_duration
	    kept = clip[:max_duration] if too_long else clip

	    kept.export(output_path, format="mp3")
	    return output_path

	def solo_zrscp(prompt):
	    """Run only the Zeroscope step for an already-written text prompt.

	    Args:
	        prompt: Text sent to the remote ``/zrscp`` endpoint.

	    Returns:
	        Whatever the remote endpoint returns for the generated video.
	    """
	    return zrscp_client.predict(prompt, api_name="/zrscp")

	def infer(audio_file):
	    """Turn an audio file into a Zeroscope video via captioning and Llama2.

	    Pipeline: truncate the audio, caption it with LP-Music-Caps, ask the
	    Llama endpoint for a matching image description, then send that
	    description to Zeroscope.

	    Args:
	        audio_file: Path of the uploaded audio file.

	    Returns:
	        A 3-tuple ``(video, description, update)``: the Zeroscope result,
	        the text returned by the Llama endpoint, and a ``gr.update`` that
	        makes a component visible.

	    Raises:
	        gr.Error: If no audio file was provided.
	    """
	    # Fail early with a readable message instead of an opaque decoding
	    # error when the button is pressed before any audio is uploaded.
	    if not audio_file:
	        raise gr.Error("Please provide an audio file first.")

	    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")

	    cap_result = lpmc_client(
	        truncated_audio,  # str (filepath or URL to file) in 'audio_path' Audio component
	        api_name="predict"
	    )
	    print(cap_result)

	    # The caption is passed to Llama as-is; the prompt text below is sent
	    # verbatim to the remote model.
	    llama_q = f"""
	I'll give you a music description.
	Give me an image description that would fit well with the music description.
	Be creative, do not do list, just an image description as required. Try to think about human characters first.
	Your image description must fit well for a stable diffusion prompt.

	Here's the music description :

	« {cap_result} »
	"""

	    result = client.predict(
	        llama_q,  # str in 'Message' Textbox component
	        "M2I",  # NOTE(review): presumably selects a mode/system prompt on the remote Space — confirm
	        api_name="/predict"
	    )

	    print(f"Llama2 result: {result}")

	    # Reuse the shared helper rather than duplicating the Zeroscope call.
	    res_vid = solo_zrscp(result)

	    print("Finished")

	    return res_vid, result, gr.update(visible=True)

	# Page-level CSS: constrains and centres the main column.
	css = """
	#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
	"""

	# UI definition. Components are created in display order, so the order of
	# the statements below is significant.
	with gr.Blocks(css=css) as demo:
	    with gr.Column(elem_id="col-container"):
	        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
	<div
	style="
	display: inline-flex;
	align-items: center;
	gap: 0.8rem;
	font-size: 1.75rem;
	"
	>
	<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
	Music To Zeroscope Video
	</h1>
	</div>
	<p style="margin-bottom: 10px; font-size: 94%">
	Sends an audio into <a href="https://huggingface.co/spaces/seungheondoh/LP-Music-Caps-demo" target="_blank">LP-Music-Caps</a>
	to generate a audio caption which is then translated to an illustrative image description with Llama2, and finally run through
	Zeroscope to generate a 3s video from the audio ! <br /><br />
	Note: Only the first 30 seconds of your audio will be used for inference.
	</p>
	</div>""")
	        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
	        infer_btn = gr.Button("Generate Video from Music")
	        # Hidden until a run completes; it then holds the prompt reused by
	        # the "Try again" button.
	        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
	        # This component displays a video, so it is labelled accordingly
	        # (it was previously labelled "Image Result").
	        vid_result = gr.Video(label="Video Result")
	        tryagain_btn = gr.Button("Try again ?", visible=False)

	        gr.Examples(
	            examples=[["./examples/electronic.mp3"], ["./examples/folk.wav"], ["./examples/orchestra.wav"]],
	            fn=infer,
	            inputs=[audio_input],
	            outputs=[vid_result, llama_trans_cap, tryagain_btn],
	            cache_examples=True
	        )

	    # Full pipeline: audio -> caption -> image description -> video.
	    infer_btn.click(fn=infer, inputs=[audio_input], outputs=[vid_result, llama_trans_cap, tryagain_btn])
	    # Regenerate the video from the existing description, skipping the
	    # captioning and Llama steps.
	    tryagain_btn.click(fn=solo_zrscp, inputs=[llama_trans_cap], outputs=[vid_result])

	demo.queue(max_size=20).launch()