import numpy as np
import soundfile as sf

# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress

# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor

def token_verifier():
    # Hardcoded visitor data and PoToken for pytubefix; YouTube rotates these,
    # so they must be refreshed when downloads start failing.
    visitor_data = "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D"
    po_token = "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA=="
    return (visitor_data, po_token)

def download_from_youtube(url, folder_path):
    """Download the highest-resolution progressive stream to folder_path as temp.mp4."""
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)
    ys = yt.streams.get_highest_resolution()
    ys.download(output_path=folder_path, filename="temp.mp4")

def separate_video_and_audio(video_path, audio_path):
    # Load the video clip
    video_clip = VideoFileClip(video_path)
    # Extract the audio track from the video clip
    audio_clip = video_clip.audio
    # Write the audio to a separate file
    audio_clip.write_audiofile(audio_path)
    # Release the file handles held by moviepy
    audio_clip.close()
    video_clip.close()

def load_audio(audio_path, sample_rate=44_100):
    audio = AudioSegment.from_file(audio_path)
    print("Entering the preprocessing of audio")
    # Resample and reformat: target sample rate, 16-bit depth, mono
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # Set bit depth to 16 bit
    audio = audio.set_channels(1)  # Set to mono
    print("Audio resampled to 16-bit mono")
    # Calculate the gain needed to reach the target loudness
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")
    # Normalize volume, limiting the applied gain to the range [-3, 3] dB
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))
    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)
    # Peak-normalize to [-1, 1]; guard against division by zero on silent audio
    max_amplitude = np.max(np.abs(waveform))
    if max_amplitude > 0:
        waveform /= max_amplitude
    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))
    return waveform, sample_rate
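
# A minimal usage sketch of load_audio on its own, kept in comments so it does
# not run on import (assumes an extracted track already exists at
# data/samples/temp.mp3):
#
#   waveform, sr = load_audio("data/samples/temp.mp3")
#   print(waveform.ndim, np.abs(waveform).max())  # 1, at most 1.0
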
# Inference settings for the MDX-Net separation model consumed by
# source_separation.Predictor
args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    "denoise": True,
    "margin": 44100,
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}
separate_predictor = Predictor(args=args, device="cpu")

def source_separation(waveform):
    """
    Separate the audio into vocals and non-vocals using the module-level predictor.

    Args:
        waveform (np.ndarray): Mono audio waveform normalized to [-1, 1].

    Returns
    -------
    tuple: (vocals, no_vocals), one mono channel each.
    """
    vocals, no_vocals = separate_predictor.predict(waveform)
    vocals = vocals[:, 0]  # vocals is stereo, only use one channel
    no_vocals = no_vocals[:, 0]  # no_vocals is stereo, only use one channel
    return vocals, no_vocals

def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Export the separated stems to WAV files."""
    sf.write(folder_path + "temp_vocals.wav", vocals, sample_rate)
    sf.write(folder_path + "temp_no_vocals.wav", no_vocals, sample_rate)

def combine_video_and_audio(video_path, no_vocals_path, output_path):
    my_clip = VideoFileClip(video_path, audio=False)
    audio_background = AudioFileClip(no_vocals_path)
    # Attach the instrumental track to the video (moviepy 2.x API)
    my_clip = my_clip.with_audio(audio_background)
    my_clip.write_videofile(output_path)

# Sample video: https://www.youtube.com/watch?v=1jZEyU_eO1s
def get_karaoke(url):
    """Download a YouTube video, strip the vocals, and remux the instrumental track."""
    folder_path = "data/samples/"
    video_path = folder_path + "temp.mp4"
    audio_path = folder_path + "temp.mp3"
    no_vocals_path = folder_path + "temp_no_vocals.wav"
    output_path = folder_path + "result.mp4"
    download_from_youtube(url, folder_path)
    separate_video_and_audio(video_path, audio_path)
    waveform, sample_rate = load_audio(audio_path)
    vocals, no_vocals = source_separation(waveform)
    export_to_wav(vocals, no_vocals, sample_rate, folder_path)
    combine_video_and_audio(video_path, no_vocals_path, output_path)
    return output_path
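
# A minimal sketch of running the full pipeline end to end; the sample URL is
# the one noted above get_karaoke. Assumes data/samples/ and data/models/
# exist with the ONNX model in place, and that the hardcoded PoToken is still
# valid.
if __name__ == "__main__":
    result = get_karaoke("https://www.youtube.com/watch?v=1jZEyU_eO1s")
    print(f"Karaoke video written to {result}")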