import numpy as np
import soundfile as sf

# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress

# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor

def token_verifier():
    # Hardcoded visitor data and PoToken for pytubefix; YouTube rotates these,
    # so they must be refreshed when downloads start failing.
    visitor_data = "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D"
    po_token = "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA=="
    return (visitor_data, po_token)

def download_from_youtube(url, folder_path):
    """Download the highest-resolution progressive stream to folder_path as temp.mp4."""
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)
    ys = yt.streams.get_highest_resolution()
    ys.download(output_path=folder_path, filename="temp.mp4")

def separate_video_and_audio(video_path, audio_path):
    # Load the video clip
    video_clip = VideoFileClip(video_path)
    # Extract the audio track from the video clip
    audio_clip = video_clip.audio
    # Write the audio to a separate file
    audio_clip.write_audiofile(audio_path)
    # Release the file handles held by moviepy
    audio_clip.close()
    video_clip.close()

def load_audio(audio_path, sample_rate=44_100):
    audio = AudioSegment.from_file(audio_path)
    print("Entering the preprocessing of audio")
    # Resample and reformat: target sample rate, 16-bit depth, mono
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # Set bit depth to 16 bit
    audio = audio.set_channels(1)  # Set to mono
    print("Audio resampled to 16-bit mono")
    # Calculate the gain needed to reach the target loudness
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")
    # Normalize volume, limiting the applied gain to the range [-3, 3] dB
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))
    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)
    # Peak-normalize to [-1, 1]; guard against division by zero on silent audio
    max_amplitude = np.max(np.abs(waveform))
    if max_amplitude > 0:
        waveform /= max_amplitude
    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))
    return waveform, sample_rate
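
# A minimal usage sketch of load_audio on its own, kept in comments so it does
# not run on import (assumes an extracted track already exists at
# data/samples/temp.mp3):
#
#   waveform, sr = load_audio("data/samples/temp.mp3")
#   print(waveform.ndim, np.abs(waveform).max())  # 1, at most 1.0
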
# Inference settings for the MDX-Net separation model consumed by
# source_separation.Predictor
args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    "denoise": True,
    "margin": 44100,
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}
separate_predictor = Predictor(args=args, device="cpu")

def source_separation(waveform):
    """
    Separate the audio into vocals and non-vocals using the module-level predictor.

    Args:
        waveform (np.ndarray): Mono audio waveform normalized to [-1, 1].

    Returns
    -------
    tuple: (vocals, no_vocals), one mono channel each.
    """
    vocals, no_vocals = separate_predictor.predict(waveform)
    vocals = vocals[:, 0]  # vocals is stereo, only use one channel
    no_vocals = no_vocals[:, 0]  # no_vocals is stereo, only use one channel
    return vocals, no_vocals

def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Export the separated stems to WAV files."""
    sf.write(folder_path + "temp_vocals.wav", vocals, sample_rate)
    sf.write(folder_path + "temp_no_vocals.wav", no_vocals, sample_rate)

def combine_video_and_audio(video_path, no_vocals_path, output_path):
    my_clip = VideoFileClip(video_path, audio=False)
    audio_background = AudioFileClip(no_vocals_path)
    # Attach the instrumental track to the video (moviepy 2.x API)
    my_clip = my_clip.with_audio(audio_background)
    my_clip.write_videofile(output_path)

# Sample video: https://www.youtube.com/watch?v=1jZEyU_eO1s
def get_karaoke(url):
    """Download a YouTube video, strip the vocals, and remux the instrumental track."""
    folder_path = "data/samples/"
    video_path = folder_path + "temp.mp4"
    audio_path = folder_path + "temp.mp3"
    no_vocals_path = folder_path + "temp_no_vocals.wav"
    output_path = folder_path + "result.mp4"
    download_from_youtube(url, folder_path)
    separate_video_and_audio(video_path, audio_path)
    waveform, sample_rate = load_audio(audio_path)
    vocals, no_vocals = source_separation(waveform)
    export_to_wav(vocals, no_vocals, sample_rate, folder_path)
    combine_video_and_audio(video_path, no_vocals_path, output_path)
    return output_path
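
# A minimal sketch of running the full pipeline end to end; the sample URL is
# the one noted above get_karaoke. Assumes data/samples/ and data/models/
# exist with the ONNX model in place, and that the hardcoded PoToken is still
# valid.
if __name__ == "__main__":
    result = get_karaoke("https://www.youtube.com/watch?v=1jZEyU_eO1s")
    print(f"Karaoke video written to {result}")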