File size: 4,302 Bytes
6c2cba8
 
 
 
 
 
 
 
 
 
db277a2
6c2cba8
 
de7d81f
 
 
 
 
 
6c2cba8
de7d81f
 
 
 
 
 
6c2cba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from pathlib import Path

import numpy as np
import soundfile as sf

# import torch
from moviepy import AudioFileClip, VideoFileClip
from pydub import AudioSegment
from pytubefix import YouTube
from pytubefix.cli import on_progress

# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from source_separation import Predictor


def token_verifier():
    """Return the ``(visitor_data, po_token)`` pair consumed by pytubefix.

    pytubefix calls this when ``use_po_token=True`` to obtain the proof-of-origin
    credentials it attaches to YouTube requests.

    Returns:
        tuple[str, str]: ``(visitor_data, po_token)``.
    """
    # NOTE(review): credentials are hard-coded and tied to one session — they
    # will expire and should be supplied via config/env rather than source.
    return (
        "CgtkTUVqS2hIcUR3SSjm-ee6BjIKCgJVUxIEGgAgIA%3D%3D",
        "MnSwBAM1XYDp6bA8Z_JBCNCoNW8B0QpC-m_9mKWKsG5JWIjKIGCD2GZDzXXoz41VM9SGWki1uE1KmqxAu9rYVLoTChUn_wlHYvQ5GMmpZLtF1sbo5zeWzhSALHMSrjxIhGV5-xPF3QfwVU-TbY8MUGZKXvSlRA==",
    )


def download_from_youtube(url, folder_path):
    """Download a YouTube video to ``folder_path`` as ``temp.mp4``.

    Authenticates requests with the po-token pair supplied by
    ``token_verifier`` and reports progress to the console.

    Args:
        url: YouTube watch URL.
        folder_path: Directory the file is written into.
    """
    video = YouTube(
        url,
        on_progress_callback=on_progress,
        use_po_token=True,
        po_token_verifier=token_verifier,
    )
    print(video.title)

    # Highest-resolution progressive stream (video + audio in one file).
    stream = video.streams.get_highest_resolution()
    stream.download(output_path=folder_path, filename="temp.mp4")


def separate_video_and_audio(video_path, audio_path):
    """Extract the audio track of ``video_path`` into ``audio_path``.

    Args:
        video_path: Path of the source video file.
        audio_path: Destination path for the extracted audio
            (format inferred from the extension by moviepy).

    Raises:
        ValueError: If the video has no audio track (the original code
            would fail with an opaque ``AttributeError`` on ``None``).
    """
    # Context manager ensures the underlying ffmpeg reader is released
    # even if writing the audio file fails (original leaked the clip).
    with VideoFileClip(video_path) as video_clip:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_path!r}")
        audio_clip.write_audiofile(audio_path)


def load_audio(audio_path, sample_rate=44_100):
    """Load an audio file as a peak-normalized mono float32 waveform.

    Preprocessing pipeline: resample to ``sample_rate``, 16-bit depth,
    mono; apply a bounded loudness gain toward -20 dBFS; then scale
    samples into [-1, 1] by the peak amplitude.

    Args:
        audio_path: Path of the input audio file (any format pydub/ffmpeg reads).
        sample_rate: Target sample rate in Hz (default 44.1 kHz).

    Returns:
        tuple: ``(waveform, sample_rate)`` where ``waveform`` is a 1-D
        ``np.float32`` array in [-1, 1].
    """
    audio = AudioSegment.from_file(audio_path)

    print("Entering the preprocessing of audio")

    # Convert the audio file to WAV format
    audio = audio.set_frame_rate(sample_rate)
    audio = audio.set_sample_width(2)  # Set bit depth to 16bit
    audio = audio.set_channels(1)  # Set to mono

    print("Audio file converted to WAV format")

    # Calculate the gain to be applied
    target_dBFS = -20
    gain = target_dBFS - audio.dBFS
    print(f"Calculating the gain needed for the audio: {gain} dB")

    # Normalize volume and limit gain range to between -3 and 3
    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))

    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)

    # Peak-normalize; guard against division by zero on silent/empty audio
    # (the original produced NaN/inf via 0/0 here).
    max_amplitude = np.max(np.abs(waveform)) if waveform.size else 0.0
    if max_amplitude > 0:
        waveform /= max_amplitude

    print(f"waveform shape: {waveform.shape}")
    print("waveform in np ndarray, dtype=" + str(waveform.dtype))

    return waveform, sample_rate


# Configuration for the MDX-Net source-separation model.
# NOTE(review): the STFT/chunking values (n_fft, dim_t, dim_f, chunks,
# margin) are model-specific — presumably they must match the ONNX
# checkpoint below; confirm against the Predictor implementation.
args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",  # ONNX weights on disk
    "denoise": True,
    "margin": 44100,  # matches the 44.1 kHz sample rate used by load_audio
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}

# Module-level predictor, built once at import time (CPU inference).
# Shared by source_separation() below.
separate_predictor = Predictor(args=args, device="cpu")


def source_separation(waveform):
    """
    Separate an audio waveform into vocals and accompaniment.

    Uses the module-level ``separate_predictor`` (MDX-Net) defined above;
    the predictor returns stereo outputs, of which only the first channel
    is kept.

    Args:
        waveform: Audio waveform array, as produced by ``load_audio``
            (mono float32 — TODO confirm the exact shape Predictor.predict
            expects).

    Returns
    -------
        tuple: ``(vocals, no_vocals)`` — two mono 1-D arrays of equal length.
    """
    vocals, no_vocals = separate_predictor.predict(waveform)

    vocals = vocals[:, 0]  # vocals is stereo, only use one channel
    no_vocals = no_vocals[:, 0]  # no_vocals is stereo, only use one channel

    return vocals, no_vocals


def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
    """Write the separated tracks as WAV files into ``folder_path``.

    Creates ``temp_vocals.wav`` and ``temp_no_vocals.wav``.

    Args:
        vocals: Mono vocal waveform array.
        no_vocals: Mono accompaniment waveform array.
        sample_rate: Sample rate in Hz for both files.
        folder_path: Output directory; a trailing separator is no longer
            required (the original's string concatenation silently built
            wrong paths without one).
    """
    out_dir = Path(folder_path)
    sf.write(str(out_dir / "temp_vocals.wav"), vocals, sample_rate)
    sf.write(str(out_dir / "temp_no_vocals.wav"), no_vocals, sample_rate)


def combine_video_and_audio(video_path, no_vocals_path, output_path):
    """Mux the instrumental audio onto the video and render the result.

    Args:
        video_path: Source video file; its own audio is discarded.
        no_vocals_path: Audio file to use as the new soundtrack.
        output_path: Destination path of the rendered video.
    """
    video_clip = VideoFileClip(video_path, audio=False)
    audio_background = AudioFileClip(no_vocals_path)
    try:
        video_clip.audio = audio_background
        video_clip.write_videofile(output_path)
    finally:
        # Release ffmpeg readers; the original leaked both clips.
        audio_background.close()
        video_clip.close()


# https://www.youtube.com/watch?v=1jZEyU_eO1s
def get_karaoke(url):
    """Build a karaoke (vocals-removed) video from a YouTube URL.

    Pipeline: download → split video/audio → preprocess audio →
    separate vocals → export WAVs → remux video with the instrumental.

    Args:
        url: YouTube watch URL.

    Returns:
        str: Path of the rendered karaoke video (``data/samples/result.mp4``).
    """
    folder = "data/samples/"
    video_file = folder + "temp.mp4"
    audio_file = folder + "temp.mp3"
    instrumental_file = folder + "temp_no_vocals.wav"
    result_file = folder + "result.mp4"

    download_from_youtube(url, folder)
    separate_video_and_audio(video_file, audio_file)
    waveform, sr = load_audio(audio_file)
    vocals, accompaniment = source_separation(waveform)
    export_to_wav(vocals, accompaniment, sr, folder)
    combine_video_and_audio(video_file, instrumental_file, result_file)
    return result_file