youtube-karaoke / model.py
RobbeD's picture
Update model.py
de7d81f verified
import numpy as np
import soundfile as sf
# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress
# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor
def token_verifier():
    """Return the ``(visitor_data, po_token)`` pair for YouTube requests.

    This function is passed as ``po_token_verifier`` when the ``YouTube``
    object is built in ``download_from_youtube``.

    The values can be overridden without editing the code by setting the
    ``YOUTUBE_VISITOR_DATA`` and/or ``YOUTUBE_PO_TOKEN`` environment
    variables. An unset or empty variable falls back to the built-in value.

    Returns:
        tuple[str, str]: ``(visitor_data, po_token)``.
    """
    import os  # local import: keeps the module's import block untouched

    default_visitor_data = "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D"
    default_po_token = "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA=="

    # `or` (rather than a .get default) so that an empty variable also
    # falls back to the built-in value.
    visitor_data = os.environ.get("YOUTUBE_VISITOR_DATA") or default_visitor_data
    po_token = os.environ.get("YOUTUBE_PO_TOKEN") or default_po_token
    return (visitor_data, po_token)
def download_from_youtube(url, folder_path):
    """Download a YouTube video into ``folder_path`` as ``temp.mp4``.

    The stream is the one selected by ``get_highest_resolution()``. The
    video title is printed before the download starts.

    Args:
        url: Address of the YouTube video.
        folder_path: Directory passed to the stream's ``download`` call as
            ``output_path``.

    Returns:
        Whatever ``download`` returns (presumably the path of the saved
        file; confirm against the pytubefix documentation).

    Raises:
        RuntimeError: If no stream could be selected for the video.
    """
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)

    stream = yt.streams.get_highest_resolution()
    # The query can come back empty; fail with a clear message instead of
    # an AttributeError on None.
    if stream is None:
        raise RuntimeError(f"No downloadable stream found for {url!r}")
    return stream.download(output_path=folder_path, filename="temp.mp4")
def separate_video_and_audio(video_path, audio_path):
    """Extract the audio track of a video file and write it to ``audio_path``.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the audio file to write.

    Raises:
        ValueError: If the video clip has no audio track.
    """
    # The context manager closes the clip even if the write fails.
    with VideoFileClip(video_path) as video_clip:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"Video has no audio track: {video_path!r}")
        audio_clip.write_audiofile(audio_path)
def load_audio(audio_path, sample_rate=44_100):
    """Load an audio file as a peak-normalised mono float32 waveform.

    The audio is resampled to ``sample_rate``, set to 16-bit samples and
    mixed down to one channel. A gain towards -20 dBFS is applied, limited
    to the range [-3, 3] dB, and the samples are then scaled so that the
    largest absolute value is 1.0.

    Silent or empty audio is returned unscaled (all zeros or empty) rather
    than being divided by a zero peak.

    Args:
        audio_path: Path of the audio file to read.
        sample_rate: Target sample rate in Hz.

    Returns:
        tuple: ``(waveform, sample_rate)`` where ``waveform`` is a 1-D
        ``np.float32`` array.
    """
    audio = AudioSegment.from_file(audio_path)
    print("Entering the preprocessing of audio")

    # Bring the audio to a uniform layout: target rate, 16-bit, mono.
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # Set bit depth to 16bit
    audio = audio.set_channels(1)  # Set to mono
    print("Audio file converted to WAV format")

    # Calculate the gain to be applied
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")

    # Normalize volume and limit gain range to between -3 and 3
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))
    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)

    # np.max raises on an empty array, and a zero peak would turn every
    # sample into NaN, so scale only when there is a non-zero peak.
    max_amplitude = np.max(np.abs(waveform)) if waveform.size else 0.0
    if max_amplitude > 0:
        waveform /= max_amplitude  # Normalize

    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))
    return waveform, sample_rate
# Settings handed to the source-separation Predictor below. The keys are
# consumed by source_separation.Predictor; see that module for what each
# one means.
args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    "denoise": True,
    "margin": 44_100,
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}

# Built once when the module is imported and reused by source_separation().
separate_predictor = Predictor(device="cpu", args=args)
def source_separation(waveform):
    """Split a waveform into a vocal track and a non-vocal track.

    The module-level ``separate_predictor`` does the separation. Only the
    first column of each of its two outputs is kept.

    Args:
        waveform: Audio samples passed straight to
            ``separate_predictor.predict``.

    Returns:
        tuple: ``(vocals, no_vocals)``, each the first column of the
        corresponding predictor output.
    """
    vocal_stereo, backing_stereo = separate_predictor.predict(waveform)
    # Both outputs are stereo; keep a single channel of each.
    return vocal_stereo[:, 0], backing_stereo[:, 0]
def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Write the two separated tracks to WAV files inside ``folder_path``.

    The files are named ``temp_vocals.wav`` and ``temp_no_vocals.wav``.

    Args:
        vocals: Samples of the vocal track.
        no_vocals: Samples of the non-vocal track.
        sample_rate: Sample rate passed to ``sf.write`` for both files.
        folder_path: Destination directory, with or without a trailing
            path separator.
    """
    import os  # local import: keeps the module's import block untouched

    # os.path.join gives the same path as plain concatenation when
    # folder_path ends with a separator, and the right one when it does not.
    sf.write(os.path.join(folder_path, "temp_vocals.wav"), vocals, sample_rate)
    sf.write(os.path.join(folder_path, "temp_no_vocals.wav"), no_vocals, sample_rate)
def combine_video_and_audio(video_path, no_vocals_path, output_path):
    """Write a copy of a video whose audio is replaced by another audio file.

    The video is opened with ``audio=False``, so its own audio is not
    loaded.

    Args:
        video_path: Path of the source video.
        no_vocals_path: Path of the audio file to attach to the video.
        output_path: Path of the video file to write.
    """
    # Context managers close both clips even if the write fails.
    with VideoFileClip(video_path, audio=False) as my_clip, AudioFileClip(
        no_vocals_path
    ) as audio_background:
        my_clip.audio = audio_background
        my_clip.write_videofile(output_path)
# Example YouTube video URL: https://www.youtube.com/watch?v=1jZEyU_eO1s
def get_karaoke(url):
    """Run the whole pipeline for one YouTube URL and return the result path.

    Steps, in order: download the video, extract its audio, load the audio
    as a waveform, separate vocals from the rest, export both tracks as
    WAV, and write the video again with the non-vocal track as its audio.
    All files are written under ``data/samples/``.

    Args:
        url: Address of the YouTube video.

    Returns:
        str: Path of the resulting video, ``data/samples/result.mp4``.
    """
    folder_path = "data/samples/"
    source_video = f"{folder_path}temp.mp4"
    extracted_audio = f"{folder_path}temp.mp3"
    backing_track = f"{folder_path}temp_no_vocals.wav"
    karaoke_video = f"{folder_path}result.mp4"

    # Fetch the video and pull out its audio track.
    download_from_youtube(url, folder_path)
    separate_video_and_audio(source_video, extracted_audio)

    # Separate vocals from the rest and store both tracks.
    waveform, sample_rate = load_audio(extracted_audio)
    vocals, no_vocals = source_separation(waveform)
    export_to_wav(vocals, no_vocals, sample_rate, folder_path)

    # Re-attach the non-vocal track to the original video.
    combine_video_and_audio(source_video, backing_track, karaoke_video)
    return karaoke_video