Spaces:
Running
Running
File size: 4,302 Bytes
6c2cba8 db277a2 6c2cba8 de7d81f 6c2cba8 de7d81f 6c2cba8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import numpy as np
import soundfile as sf
# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress
# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor
def token_verifier():
    """Supply the hard-coded credentials used for PO-token access.

    This function is handed to ``YouTube`` as its ``po_token_verifier``
    callback.

    Returns:
        tuple[str, str]: ``(visitor_data, po_token)``, in that order.
    """
    # NOTE(review): both values are static and embedded in the source; they
    # presumably expire at some point -- confirm how they should be refreshed.
    return (
        "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D",
        "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA==",
    )
def download_from_youtube(url, folder_path):
    """Download the highest-resolution stream of a YouTube video.

    The video title is printed, and the file is always saved under the
    name ``temp.mp4`` inside ``folder_path``.

    Args:
        url: URL of the YouTube video.
        folder_path: Directory the video is downloaded into.

    Raises:
        RuntimeError: If no downloadable stream is found for ``url``.
    """
    yt = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(yt.title)
    ys = yt.streams.get_highest_resolution()
    # The stream query yields None when nothing matches; fail with a clear
    # message instead of an AttributeError on the download call below.
    if ys is None:
        raise RuntimeError(f"No downloadable stream found for {url}")
    ys.download(output_path=folder_path, filename="temp.mp4")
def separate_video_and_audio(video_path, audio_path):
    """Extract the audio track of a video file into its own file.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path the extracted audio is written to.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Load the video clip
    video_clip = VideoFileClip(video_path)
    try:
        # Extract the audio from the video clip
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_path}")
        # Write the audio to a separate file
        audio_clip.write_audiofile(audio_path)
    finally:
        # Always release the resources held by the clip, even on failure.
        video_clip.close()
def load_audio(audio_path, sample_rate=44_100):
    """Load an audio file as a peak-normalized, mono float32 waveform.

    The audio is resampled to ``sample_rate``, converted to 16-bit mono,
    gain-adjusted towards -20 dBFS (the adjustment is clamped to +/-3 dB)
    and finally scaled so that its largest absolute sample equals 1.0.
    Silent or empty audio is returned unscaled instead of being divided
    by zero.

    Args:
        audio_path: Path of the audio file to load.
        sample_rate: Target sample rate in Hz.

    Returns:
        tuple: ``(waveform, sample_rate)`` where ``waveform`` is a 1-D
        ``numpy.float32`` array.
    """
    audio = AudioSegment.from_file(audio_path)
    print("Entering the preprocessing of audio")

    # Bring the audio to a uniform in-memory format
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # Set bit depth to 16bit
    audio = audio.set_channels(1)  # Set to mono
    print("Audio file converted to WAV format")

    # Calculate the gain to be applied
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")

    # Normalize volume and limit gain range to between -3 and 3
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))
    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)

    # Peak-normalize; skip the division for empty or all-zero audio, which
    # would otherwise raise (empty) or fill the waveform with NaN (silent).
    max_amplitude = np.max(np.abs(waveform)) if waveform.size else 0.0
    if max_amplitude > 0:
        waveform /= max_amplitude  # Normalize

    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))
    return waveform, sample_rate
# Settings handed to the source-separation predictor.
# NOTE(review): margin presumably equals one second at 44.1 kHz -- confirm
# against the Predictor implementation.
args = dict(
    model_path="data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    denoise=True,
    margin=44100,
    chunks=15,
    n_fft=6144,
    dim_t=8,
    dim_f=3072,
)

# Built once at import time and reused by source_separation(); runs on CPU.
separate_predictor = Predictor(args=args, device="cpu")
def source_separation(waveform):
    """Split a waveform into vocal and non-vocal parts.

    Runs the module-level ``separate_predictor`` on ``waveform`` and keeps
    only column 0 of each of the two 2-D arrays it returns.

    Args:
        waveform: Audio samples passed straight to the predictor.

    Returns:
        tuple: ``(vocals, no_vocals)``, each reduced to a single channel.
    """
    vocal_stem, backing_stem = separate_predictor.predict(waveform)
    # Per the original author's note both outputs are stereo; one channel
    # is enough here, so take the first.
    return vocal_stem[:, 0], backing_stem[:, 0]
def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Write the separated stems as WAV files inside ``folder_path``.

    The files are named ``temp_vocals.wav`` and ``temp_no_vocals.wav``.

    Args:
        vocals: Samples of the vocal stem.
        no_vocals: Samples of the non-vocal stem.
        sample_rate: Sample rate, in Hz, written to both files.
        folder_path: Directory the files are written to; a trailing path
            separator is optional.
    """
    import os

    # Join instead of concatenating so that a folder given without a
    # trailing separator still resolves to a file inside that folder.
    sf.write(os.path.join(folder_path, "temp_vocals.wav"), vocals, sample_rate)
    sf.write(os.path.join(folder_path, "temp_no_vocals.wav"), no_vocals, sample_rate)
def combine_video_and_audio(video_path, no_vocals_path, output_path):
    """Write a copy of a video whose soundtrack is replaced by another file.

    Args:
        video_path: Path of the source video; it is loaded without audio.
        no_vocals_path: Path of the audio file used as the new soundtrack.
        output_path: Path the resulting video is written to.
    """
    my_clip = VideoFileClip(video_path, audio=False)
    try:
        audio_background = AudioFileClip(no_vocals_path)
        try:
            my_clip.audio = audio_background
            my_clip.write_videofile(output_path)
        finally:
            # Release the audio reader even if writing the video failed.
            audio_background.close()
    finally:
        # Release the video reader in every case.
        my_clip.close()
# https://www.youtube.com/watch?v=1jZEyU_eO1s
def get_karaoke(url, folder_path="data/samples/"):
    """Build a karaoke (vocals-removed) version of a YouTube video.

    Pipeline: download the video, extract its audio, load and normalize
    the waveform, separate vocals from the rest, export both stems as WAV
    files, then write the video again with the non-vocal stem as its
    soundtrack.

    Args:
        url: URL of the YouTube video.
        folder_path: Working directory for all intermediate files and the
            result. Defaults to ``"data/samples/"``.

    Returns:
        str: Path of the resulting video, ``result.mp4`` inside
        ``folder_path``.
    """
    import os

    # Guarantee a trailing separator so the file names below can be
    # appended to the folder string directly.
    folder_path = os.path.join(folder_path, "")

    video_path = folder_path + "temp.mp4"
    audio_path = folder_path + "temp.mp3"
    no_vocals_path = folder_path + "temp_no_vocals.wav"
    output_path = folder_path + "result.mp4"

    download_from_youtube(url, folder_path)
    separate_video_and_audio(video_path, audio_path)
    waveform, sample_rate = load_audio(audio_path)
    vocals, no_vocals = source_separation(waveform)
    export_to_wav(vocals, no_vocals, sample_rate, folder_path)
    combine_video_and_audio(video_path, no_vocals_path, output_path)
    return output_path
|