Video-to-SoundFX

Running

App Files Files Community

Video-to-SoundFX / app.py

fffiloni

Update app.py

cafd44e verified 12 months ago

raw

history blame contribute delete

10 kB

	import gradio as gr
	from gradio_client import Client
	import os
	import json
	import re
	from moviepy.editor import *
	import cv2

	# Hugging Face access token read from the HF_TKN environment variable
	# (None when the variable is unset); get_caption passes it to its Space client.
	hf_token = os.environ.get("HF_TKN", None)

	def extract_firstframe(video_in):
	    """Save the first frame of a video as ``first_frame.jpg``.

	    Args:
	        video_in: Path of the video file, opened with ``cv2.VideoCapture``.

	    Returns:
	        The string ``"first_frame.jpg"``, i.e. the path the frame was written
	        to (relative to the current working directory).

	    Raises:
	        gr.Error: If no frame can be read from ``video_in`` or the image
	            cannot be written.
	    """
	    vidcap = cv2.VideoCapture(video_in)
	    try:
	        # Only the first frame is needed, so a single read() is enough.
	        success, image = vidcap.read()
	    finally:
	        # Always free the capture handle, even if reading raised.
	        vidcap.release()

	    if not success:
	        # Fail loudly instead of returning a path that may point at a stale
	        # first_frame.jpg left behind by an earlier call.
	        raise gr.Error("Could not read the first frame of the video.")

	    if not cv2.imwrite("first_frame.jpg", image):
	        raise gr.Error("Could not save the first frame of the video.")

	    print("Done extracted first frame!")
	    return "first_frame.jpg"

	def extract_audio(video_in):
	    """Write the audio track of a video to ``audio.wav``.

	    Args:
	        video_in: Path of the video file, opened with ``VideoFileClip``.

	    Returns:
	        The string ``'audio.wav'``, i.e. the path of the written file
	        (relative to the current working directory).

	    Raises:
	        gr.Error: If the video has no audio track.
	    """
	    output_audio = 'audio.wav'

	    video_clip = VideoFileClip(video_in)
	    try:
	        audio_clip = video_clip.audio
	        if audio_clip is None:
	            # Without this check the call below fails with an opaque
	            # AttributeError on None.
	            raise gr.Error("The video has no audio track to extract.")

	        # Use 44100 Hz as the sample rate for the .wav file.
	        audio_clip.write_audiofile(output_audio, fps=44100)
	    finally:
	        # Release the readers held by the clip, whether or not writing succeeded.
	        video_clip.close()

	    print("Audio extraction complete.")
	    return output_audio

	def _clean_kosmos_caption(full_sentence):
	    """Strip the instruction prefix and any trailing unfinished sentence."""
	    # Capture everything after the prefix; DOTALL lets the description span
	    # several lines.
	    pattern = r'^Describe this image in detail:\s*(.*)$'
	    match = re.search(pattern, full_sentence, re.DOTALL)
	    if match:
	        description = match.group(1)
	        print(description)
	    else:
	        print("Unable to locate valid description.")
	        # Fall back to the raw text so the caller still gets a caption.
	        description = full_sentence

	    last_period_index = description.rfind('.')
	    if last_period_index == -1:
	        # No complete sentence found: keep the text rather than returning "".
	        return description
	    # Drop whatever follows the last period.
	    return description[:last_period_index + 1]


	def get_caption_from_kosmos(image_in):
	    """Get a detailed caption for an image from the Kosmos-2 Space.

	    Calls the Space with the "Detailed" description type, opens the second
	    element of the result as a JSON file, joins the first item of each entry
	    with spaces, then removes the "Describe this image in detail:" prefix and
	    anything after the last period.

	    Args:
	        image_in: Filepath or URL of the image to describe.

	    Returns:
	        The cleaned caption string.
	    """
	    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")

	    kosmos2_result = kosmos2_client.predict(
	        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
	        "Detailed",  # str in 'Description Type' Radio component
	        fn_index=4
	    )

	    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

	    # JSON is UTF-8 by specification; do not depend on the locale default.
	    with open(kosmos2_result[1], 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    full_sentence = ' '.join(sublist[0] for sublist in data)

	    truncated_caption = _clean_kosmos_caption(full_sentence)
	    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")

	    return truncated_caption

	def get_caption(image_in):
	    """Ask the moondream1 Space to describe an image in one sentence.

	    Args:
	        image_in: Filepath of the image sent to the Space's 'image' component.

	    Returns:
	        Whatever the Space's "/predict" endpoint returns for the question.
	    """
	    question = "Describe precisely the image in one sentence."
	    moondream = Client("https://fffiloni-moondream1.hf.space/", hf_token=hf_token)
	    answer = moondream.predict(image_in, question, api_name="/predict")
	    print(answer)
	    return answer

	def get_magnet(prompt):
	    """Generate audio for a text prompt with the MAGNeT Space.

	    Args:
	        prompt: Text sent as the Space's 'Input Text'.

	    Returns:
	        The second element of the result of the "/predict_full" endpoint.
	    """
	    amended_prompt = f"{prompt}"
	    print(amended_prompt)

	    # Positional inputs, in the order the "/predict_full" endpoint expects.
	    predict_args = (
	        "facebook/audio-magnet-medium",  # 'Model'
	        "",                              # 'Model Path (custom models)'
	        amended_prompt,                  # 'Input Text'
	        3,                               # 'Temperature'
	        0.9,                             # 'Top-p'
	        10,                              # 'Max CFG coefficient'
	        1,                               # 'Min CFG coefficient'
	        20,                              # 'Decoding Steps (stage 1)'
	        10,                              # 'Decoding Steps (stage 2)'
	        10,                              # 'Decoding Steps (stage 3)'
	        10,                              # 'Decoding Steps (stage 4)'
	        "prod-stride1 (new!)",           # 'Span Scoring'
	    )

	    magnet = Client("https://fffiloni-magnet.hf.space/")
	    outputs = magnet.predict(*predict_args, api_name="/predict_full")
	    print(outputs)
	    return outputs[1]

	def get_audioldm(prompt):
	    """Generate audio for a text prompt with the AudioLDM-2 Space.

	    The Space result is handed to ``extract_audio``, whose return value
	    (the path of the extracted .wav file) is returned.

	    Args:
	        prompt: Text sent as the Space's 'Input text'.

	    Returns:
	        The value returned by ``extract_audio`` for the Space result.
	    """
	    # Positional inputs for the function at fn_index=1.
	    predict_args = (
	        prompt,                 # 'Input text'
	        "Low quality. Music.",  # 'Negative prompt'
	        10,                     # 'Duration (seconds)'
	        3.5,                    # 'Guidance scale'
	        45,                     # 'Seed'
	        3,                      # 'Number waveforms to generate'
	    )

	    audioldm = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
	    generated = audioldm.predict(*predict_args, fn_index=1)
	    print(generated)
	    return extract_audio(generated)

	def get_audiogen(prompt):
	    """Generate audio for a text prompt with the AudioGen Space.

	    Args:
	        prompt: Text passed as the first input of the "/infer" endpoint.

	    Returns:
	        Whatever the "/infer" endpoint returns.
	    """
	    audiogen = Client("https://fffiloni-audiogen.hf.space/")
	    # The second positional input is the constant 10.
	    return audiogen.predict(prompt, 10, api_name="/infer")

	def get_tango(prompt):
	    """Generate audio for a text prompt with the Tango Space.

	    Args:
	        prompt: Text sent as the Space's 'Prompt'.

	    Returns:
	        Whatever the "/predict" endpoint returns.

	    Raises:
	        gr.Error: If the client for the Tango Space cannot be created.
	    """
	    try:
	        client = Client("https://declare-lab-tango.hf.space/")
	    except Exception as err:
	        # Catch Exception, not everything: KeyboardInterrupt and SystemExit
	        # must still propagate. Chain the cause so it stays in the logs.
	        raise gr.Error("Tango space API is not ready, please try again in few minutes ") from err

	    result = client.predict(
	        prompt,  # str in 'Prompt' Textbox component
	        100,     # numeric value between 100 and 200 in 'Steps' Slider component
	        4,       # numeric value between 1 and 10 in 'Guidance Scale' Slider component
	        api_name="/predict"
	    )
	    print(result)
	    return result

	def blend_vsfx(video_in, audio_result):
	    """Mux a generated audio file onto a video and write the result as MP4.

	    The longer of the two clips is trimmed to the duration of the shorter
	    one before the audio is attached.

	    Args:
	        video_in: Path of the source video file.
	        audio_result: Path of the audio file to use as the soundtrack.

	    Returns:
	        The string ``"final_video_with_sound.mp4"``, i.e. the path of the
	        written file (relative to the current working directory).
	    """
	    audio_source = AudioFileClip(audio_result)
	    try:
	        print(f"AUD: {audio_source.duration}")
	        video_source = VideoFileClip(video_in)
	        try:
	            print(f"VID: {video_source.duration}")

	            # Trim whichever clip is longer so both have the same duration.
	            audio_track, video_track = audio_source, video_source
	            if video_source.duration < audio_source.duration:
	                audio_track = audio_source.subclip(0.0, video_source.duration)
	            elif video_source.duration > audio_source.duration:
	                video_track = video_source.subclip(0.0, audio_source.duration)

	            final_clip = video_track.set_audio(audio_track)
	            final_clip.write_videofile(
	                'final_video_with_sound.mp4',
	                codec='libx264',
	                audio_codec='aac',
	            )
	        finally:
	            # Close the clips that were opened here so their readers are
	            # released after every call.
	            video_source.close()
	    finally:
	        audio_source.close()

	    return "final_video_with_sound.mp4"

	def infer(video_in, chosen_model):
	    """Caption a video's first frame, generate matching audio and mux it in.

	    Args:
	        video_in: Path of the uploaded video.
	        chosen_model: One of "MAGNet", "AudioLDM-2", "AudioGen" or "Tango".

	    Returns:
	        A 4-tuple: an update that sets the caption and makes it editable,
	        an update that enables a component, the generated audio result, and
	        the path of the video with the new soundtrack.

	    Raises:
	        gr.Error: If no video was provided or ``chosen_model`` is unknown.
	    """
	    if video_in is None:
	        raise gr.Error("Please upload a video first.")

	    generators = {
	        "MAGNet": get_magnet,
	        "AudioLDM-2": get_audioldm,
	        "AudioGen": get_audiogen,
	        "Tango": get_tango,
	    }
	    generate_audio = generators.get(chosen_model)
	    if generate_audio is None:
	        # Reject unknown models before the captioning call, instead of
	        # failing later on an unbound audio_result.
	        raise gr.Error(f"Unknown model: {chosen_model}")

	    image_in = extract_firstframe(video_in)
	    caption = get_caption(image_in)
	    audio_result = generate_audio(caption)

	    final_res = blend_vsfx(video_in, audio_result)
	    return gr.update(value=caption, interactive=True), gr.update(interactive=True), audio_result, final_res

	def retry(edited_prompt, video_in, chosen_model):
	    """Regenerate the audio from a user-edited caption and mux it in.

	    Args:
	        edited_prompt: Caption text used as the audio prompt.
	        video_in: Path of the uploaded video.
	        chosen_model: One of "MAGNet", "AudioLDM-2", "AudioGen" or "Tango".

	    Returns:
	        A 2-tuple: the generated audio result and the path of the video
	        with the new soundtrack.

	    Raises:
	        gr.Error: If no video was provided or ``chosen_model`` is unknown.
	    """
	    if video_in is None:
	        raise gr.Error("Please upload a video first.")

	    generators = {
	        "MAGNet": get_magnet,
	        "AudioLDM-2": get_audioldm,
	        "AudioGen": get_audiogen,
	        "Tango": get_tango,
	    }
	    generate_audio = generators.get(chosen_model)
	    if generate_audio is None:
	        # Without this check an unknown model fails later on an unbound
	        # audio_result.
	        raise gr.Error(f"Unknown model: {chosen_model}")

	    # The caption comes from the user here, so no frame is extracted.
	    audio_result = generate_audio(edited_prompt)

	    final_res = blend_vsfx(video_in, audio_result)
	    return audio_result, final_res

	def refresh():
	    """Build the four updates that reset the output components.

	    Returns:
	        A 4-tuple of ``gr.update`` objects: value cleared and not
	        interactive, not interactive, value cleared, value cleared.
	    """
	    locked_and_cleared = gr.update(value=None, interactive=False)
	    locked = gr.update(interactive=False)
	    first_cleared = gr.update(value=None)
	    second_cleared = gr.update(value=None)
	    return locked_and_cleared, locked, first_cleared, second_cleared

	# Page-level CSS handed to gr.Blocks below; it styles the element with id "col-container".
	css="""
	#col-container{
	margin: 0 auto;
	max-width: 800px;
	}
	"""

	# Gradio UI definition.
	# NOTE(review): the indentation of this section is not visible in this copy of the
	# file, so the nesting of the `with` blocks cannot be read from here. The
	# grouping comments below are presumed from statement order; confirm against the original.
	with gr.Blocks(css=css) as demo:
	with gr.Column(elem_id="col-container"):
	# Static title and description.
	gr.HTML("""
	<h2 style="text-align: center;">
	Video to SoundFX
	</h2>
	<p style="text-align: center;">
	Get sound effects from a video shot while comparing audio models from image caption.
	</p>
	""")

	with gr.Row():

	# Inputs: uploaded video, model choice (defaults to "Tango") and submit button.
	with gr.Column():
	video_in = gr.Video(sources=["upload"], label="Video input")
	with gr.Row():
	chosen_model = gr.Dropdown(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen", "Tango"], value="Tango")
	submit_btn = gr.Button("Submit", scale=0)
	# Intermediate outputs: caption and retry button start non-interactive;
	# infer() returns updates that enable them.
	with gr.Column():
	caption_o = gr.Textbox(label="Scene caption", interactive=False)
	retry_btn = gr.Button("Retry with edited scene caption", interactive=False)
	audio_o = gr.Audio(label="Audio output")
	# Final output: the video with the generated soundtrack.
	with gr.Column():
	video_o = gr.Video(label="Video with soundFX")

	# Example inputs wired to infer(); example caching is left disabled.
	gr.Examples(
	examples = [
	["examples/photoreal-train.mp4", "Tango"],
	["examples/train-window.mp4", "Tango"],
	["examples/chinese-new-year-dragon.mp4", "Tango"],
	["examples/big-sur.mp4", "AudioLDM-2"]
	],
	fn=infer,
	inputs = [video_in, chosen_model],
	outputs= [caption_o, retry_btn, audio_o, video_o],
	#cache_examples=True
	)

	# Reset all outputs whenever the input video changes.
	video_in.change(
	fn = refresh,
	inputs = None,
	outputs = [caption_o, retry_btn, audio_o, video_o],
	queue = False,
	show_progress = False
	)

	# Same reset when the input video is cleared.
	video_in.clear(
	fn = refresh,
	inputs = None,
	outputs = [caption_o, retry_btn, audio_o, video_o],
	queue = False,
	show_progress = False
	)

	# Full pipeline: caption, audio generation and muxing.
	submit_btn.click(
	fn=infer,
	inputs=[video_in, chosen_model],
	outputs=[caption_o, retry_btn, audio_o, video_o],
	concurrency_limit = 2
	)

	# Regenerate audio and video from the caption currently in the textbox.
	retry_btn.click(
	fn=retry,
	inputs=[caption_o, video_in, chosen_model],
	outputs=[audio_o, video_o],
	concurrency_limit = 2
	)

	# Start the app with a request queue capped at 10 entries.
	demo.queue(max_size=10).launch(show_api=False, debug=True, show_error=True)