RobbeD committed
Commit 6c2cba8 (verified) · Parent(s): 1786550

initial commit
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/samples/result.mp4 filter=lfs diff=lfs merge=lfs -text
+data/samples/temp_no_vocals.wav filter=lfs diff=lfs merge=lfs -text
+data/samples/temp_vocals.wav filter=lfs diff=lfs merge=lfs -text
+data/samples/temp.mp3 filter=lfs diff=lfs merge=lfs -text
+data/samples/temp.mp4 filter=lfs diff=lfs merge=lfs -text
data/models/UVR-MDX-NET-Inst_HQ_3.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:317554b07fe1ea5279a77f2b1520a41ea4b93432560c4ffd08792c30fddf9adc
+size 66759214
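The oid above is the SHA-256 of the actual ONNX weights (that is what a Git LFS pointer stores), so a local copy of the model can be checked against it. A minimal sketch, assuming the file sits at the path used in the repo:

import hashlib

EXPECTED_OID = "317554b07fe1ea5279a77f2b1520a41ea4b93432560c4ffd08792c30fddf9adc"

sha256 = hashlib.sha256()
with open("data/models/UVR-MDX-NET-Inst_HQ_3.onnx", "rb") as f:
    # Hash in 1 MiB blocks to avoid loading the ~67 MB model into memory at once.
    for block in iter(lambda: f.read(1 << 20), b""):
        sha256.update(block)

assert sha256.hexdigest() == EXPECTED_OID, "Model file does not match the LFS pointer"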
data/samples/result.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3a5c839f552d27b110e7db77ac74cb41a5c51c6c8376a75814aa4fc5a0c5921
+size 16601916
data/samples/temp.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1322f661bf6c9b22a6e30283933f223358ad68fab06d73017cb80363e6e3ff50
+size 4749941
data/samples/temp.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:302dd0780f1420599fa5bc179eb766981aac39883b4b79f8f0273f94d11d2542
+size 14761845
data/samples/temp_no_vocals.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96ea44ba19641369a63e5ab8ec403e204b88e7aab35b7670f6af2b6811d912de
+size 26179568
data/samples/temp_vocals.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:087b4afcc655ab2b0c0e25e196ee559bb661c996438ff897e5ef671cd51f4564
+size 26179568
demo.py ADDED
@@ -0,0 +1,16 @@
+import gradio as gr
+
+from youtube_karaoke.model import get_karaoke
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column(), gr.Row():
+            url = gr.Textbox(placeholder="Youtube video URL", label="URL")
+
+        with gr.Column():
+            outputs = gr.PlayableVideo()
+
+    transcribe_btn = gr.Button("YouTube Karaoke")
+    transcribe_btn.click(get_karaoke, inputs=url, outputs=outputs)
+
+demo.launch(debug=True)
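For quick testing without the Gradio UI, the same entry point can be called directly. A minimal sketch, assuming the youtube_karaoke package is importable as in demo.py; the URL is the sample one mentioned in model.py:

from youtube_karaoke.model import get_karaoke

# Returns the path to the rendered karaoke video (data/samples/result.mp4 in the current layout).
result_path = get_karaoke("https://www.youtube.com/watch?v=1jZEyU_eO1s")
print(result_path)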
model.py ADDED
@@ -0,0 +1,123 @@
+import numpy as np
+import soundfile as sf
+
+# import torch
+from moviepy import AudioFileClip, VideoFileClip
+from pydub import AudioSegment
+from pytubefix import YouTube
+from pytubefix.cli import on_progress
+
+# from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+from youtube_karaoke.source_separation import Predictor
+
+
+def download_from_youtube(url, folder_path):
+    yt = YouTube(url, on_progress_callback=on_progress)
+    print(yt.title)
+
+    ys = yt.streams.get_highest_resolution()
+    ys.download(output_path=folder_path, filename="temp.mp4")
+
+
+def separate_video_and_audio(video_path, audio_path):
+    # Load the video clip
+    video_clip = VideoFileClip(video_path)
+
+    # Extract the audio from the video clip
+    audio_clip = video_clip.audio
+
+    # Write the audio to a separate file
+    audio_clip.write_audiofile(audio_path)
+
+
+def load_audio(audio_path, sample_rate=44_100):
+    audio = AudioSegment.from_file(audio_path)
+
+    print("Entering the preprocessing of audio")
+
+    # Resample, set 16-bit depth, and downmix to mono
+    audio = audio.set_frame_rate(sample_rate)
+    audio = audio.set_sample_width(2)  # Set bit depth to 16 bit
+    audio = audio.set_channels(1)  # Set to mono
+
+    print("Audio file converted to WAV format")
+
+    # Calculate the gain to be applied
+    target_dBFS = -20
+    gain = target_dBFS - audio.dBFS
+    print(f"Calculating the gain needed for the audio: {gain} dB")
+
+    # Normalize volume and limit gain range to between -3 and 3
+    normalized_audio = audio.apply_gain(min(max(gain, -3), 3))
+
+    waveform = np.array(normalized_audio.get_array_of_samples(), dtype=np.float32)
+    max_amplitude = np.max(np.abs(waveform))
+    waveform /= max_amplitude  # Normalize to [-1, 1]
+
+    print(f"waveform shape: {waveform.shape}")
+    print("waveform in np ndarray, dtype=" + str(waveform.dtype))
+
+    return waveform, sample_rate
+
+
+args = {
+    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
+    "denoise": True,
+    "margin": 44100,
+    "chunks": 15,
+    "n_fft": 6144,
+    "dim_t": 8,
+    "dim_f": 3072,
+}
+
+separate_predictor = Predictor(args=args, device="cpu")
+
+
+def source_separation(waveform):
+    """
+    Separate the audio into vocals and non-vocals using the module-level predictor.
+
+    Args:
+        waveform (np.ndarray): Mono audio waveform, as returned by load_audio().
+
+    Returns
+    -------
+    tuple: The separated vocals and no-vocals waveforms (one channel each).
+    """
+    vocals, no_vocals = separate_predictor.predict(waveform)
+
+    vocals = vocals[:, 0]  # vocals is stereo, only use one channel
+    no_vocals = no_vocals[:, 0]  # no_vocals is stereo, only use one channel
+
+    return vocals, no_vocals
+
+
+def export_to_wav(vocals, no_vocals, sample_rate, folder_path):
+    """Export separated audio to WAV files."""
+    sf.write(folder_path + "temp_vocals.wav", vocals, sample_rate)
+    sf.write(folder_path + "temp_no_vocals.wav", no_vocals, sample_rate)
+
+
+def combine_video_and_audio(video_path, no_vocals_path, output_path):
+    my_clip = VideoFileClip(video_path, audio=False)
+    audio_background = AudioFileClip(no_vocals_path)
+    my_clip.audio = audio_background
+    my_clip.write_videofile(output_path)
+
+
+# Example input: https://www.youtube.com/watch?v=1jZEyU_eO1s
+def get_karaoke(url):
+    folder_path = "data/samples/"
+    video_path = folder_path + "temp.mp4"
+    audio_path = folder_path + "temp.mp3"
+    no_vocals_path = folder_path + "temp_no_vocals.wav"
+    output_path = folder_path + "result.mp4"
+
+    download_from_youtube(url, folder_path)
+    separate_video_and_audio(video_path, audio_path)
+    waveform, sample_rate = load_audio(audio_path)
+    vocals, no_vocals = source_separation(waveform)
+    export_to_wav(vocals, no_vocals, sample_rate, folder_path)
+    combine_video_and_audio(video_path, no_vocals_path, output_path)
+    return output_path
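The helpers above also work on a local video instead of a YouTube download, by skipping download_from_youtube and reusing the rest of the pipeline. A minimal sketch; the my_video/my_result file names are placeholders, not files in the repo:

from youtube_karaoke.model import (
    combine_video_and_audio,
    export_to_wav,
    load_audio,
    separate_video_and_audio,
    source_separation,
)

video_path = "data/samples/my_video.mp4"        # hypothetical local input
audio_path = "data/samples/my_video.mp3"        # extracted soundtrack
no_vocals_path = "data/samples/temp_no_vocals.wav"  # written by export_to_wav
output_path = "data/samples/my_result.mp4"      # hypothetical output

separate_video_and_audio(video_path, audio_path)   # demux audio with moviepy
waveform, sample_rate = load_audio(audio_path)     # mono float32 at 44.1 kHz
vocals, no_vocals = source_separation(waveform)    # MDX-Net vocal/instrumental split
export_to_wav(vocals, no_vocals, sample_rate, "data/samples/")
combine_video_and_audio(video_path, no_vocals_path, output_path)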
packages.txt ADDED
@@ -0,0 +1,2 @@
+ffmpeg
+libsndfile1
requirements.txt ADDED
@@ -0,0 +1,11 @@
+onnxruntime==1.20.1
+torch==2.5.1
+tqdm==4.67.1
+llvmlite==0.43.0
+librosa==0.10.2.post1
+pydub==0.25.1
+transformers==4.47.0
+pytubefix==8.8.1
+accelerate==1.2.0
+moviepy==2.1.1
+gradio==5.8.0
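To confirm that a local environment actually matches these pins before launching the demo, one option is a small check with importlib.metadata. A sketch; the pin list is copied from the requirements above:

from importlib.metadata import PackageNotFoundError, version

PINS = {
    "onnxruntime": "1.20.1",
    "torch": "2.5.1",
    "tqdm": "4.67.1",
    "llvmlite": "0.43.0",
    "librosa": "0.10.2.post1",
    "pydub": "0.25.1",
    "transformers": "4.47.0",
    "pytubefix": "8.8.1",
    "accelerate": "1.2.0",
    "moviepy": "2.1.1",
    "gradio": "5.8.0",
}

for name, wanted in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (want {wanted})")
        continue
    if installed != wanted:
        print(f"{name}: {installed} installed, {wanted} pinned")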
source_separation.py ADDED
@@ -0,0 +1,291 @@
+# Copyright (c) 2023 seanghay
+#
+# This code is from an unlicensed repository.
+#
+# Note: This code has been modified to fit the context of this repository.
+# This code is included in an MIT-licensed repository.
+# The repository's MIT license does not apply to this code.
+
+# This code is modified from https://github.com/seanghay/uvr-mdx-infer/blob/main/separate.py
+
+import numpy as np
+import onnxruntime as ort
+import torch
+from tqdm import tqdm
+
+
+class ConvTDFNet:
+    """
+    ConvTDFNet - Convolutional Temporal Frequency Domain Network.
+    """
+
+    def __init__(self, target_name, L, dim_f, dim_t, n_fft, hop=1024):
+        """
+        Initialize ConvTDFNet.
+
+        Args:
+            target_name (str): The target name for separation.
+            L (int): Number of layers.
+            dim_f (int): Dimension in the frequency domain.
+            dim_t (int): Dimension in the time domain (log2).
+            n_fft (int): FFT size.
+            hop (int, optional): Hop size. Defaults to 1024.
+
+        Returns
+        -------
+        None
+        """
+        super(ConvTDFNet, self).__init__()
+        self.dim_c = 4
+        self.dim_f = dim_f
+        self.dim_t = 2**dim_t
+        self.n_fft = n_fft
+        self.hop = hop
+        self.n_bins = self.n_fft // 2 + 1
+        self.chunk_size = hop * (self.dim_t - 1)
+        self.window = torch.hann_window(window_length=self.n_fft, periodic=True)
+        self.target_name = target_name
+
+        out_c = self.dim_c * 4 if target_name == "*" else self.dim_c
+
+        self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t])
+        self.n = L // 2
+
+    def stft(self, x):
+        """
+        Perform Short-Time Fourier Transform (STFT).
+
+        Args:
+            x (torch.Tensor): Input waveform.
+
+        Returns
+        -------
+        torch.Tensor: STFT of the input waveform.
+        """
+        x = x.reshape([-1, self.chunk_size])
+        x = torch.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop,
+            window=self.window,
+            center=True,
+            return_complex=True,
+        )
+        x = torch.view_as_real(x)
+        x = x.permute([0, 3, 1, 2])
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+            [-1, self.dim_c, self.n_bins, self.dim_t]
+        )
+        return x[:, :, : self.dim_f]
+
+    def istft(self, x, freq_pad=None):
+        """
+        Perform Inverse Short-Time Fourier Transform (ISTFT).
+
+        Args:
+            x (torch.Tensor): Input STFT.
+            freq_pad (torch.Tensor, optional): Frequency padding. Defaults to None.
+
+        Returns
+        -------
+        torch.Tensor: Inverse STFT of the input.
+        """
+        freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad
+        x = torch.cat([x, freq_pad], -2)
+        c = 4 * 2 if self.target_name == "*" else 2
+        x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])
+        x = x.permute([0, 2, 3, 1])
+        x = x.contiguous()
+        x = torch.view_as_complex(x)
+        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
+        return x.reshape([-1, c, self.chunk_size])
+
+
+class Predictor:
+    """
+    Predictor class for source separation using ConvTDFNet and ONNX Runtime.
+    """
+
+    def __init__(self, args, device):
+        """
+        Initialize the Predictor.
+
+        Args:
+            args (dict): Configuration arguments.
+            device (str): Device to run the model ('cuda' or 'cpu').
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        ValueError: If the provided device is not 'cuda' or 'cpu'.
+        """
+        self.args = args
+        self.model_ = ConvTDFNet(
+            target_name="vocals",
+            L=11,
+            dim_f=args["dim_f"],
+            dim_t=args["dim_t"],
+            n_fft=args["n_fft"],
+        )
+
+        if device == "cuda":
+            self.model = ort.InferenceSession(
+                args["model_path"], providers=["CUDAExecutionProvider"]
+            )
+        elif device == "cpu":
+            self.model = ort.InferenceSession(
+                args["model_path"], providers=["CPUExecutionProvider"]
+            )
+        else:
+            raise ValueError("Device must be either 'cuda' or 'cpu'")
+
+    def demix(self, mix):
+        """
+        Separate the sources from the input mix.
+
+        Args:
+            mix (np.ndarray): Input mixture signal.
+
+        Returns
+        -------
+        np.ndarray: Separated sources.
+
+        Raises
+        ------
+        AssertionError: If margin is zero.
+        """
+        samples = mix.shape[-1]
+        margin = self.args["margin"]
+        chunk_size = self.args["chunks"] * 44100
+
+        assert margin != 0, "Margin cannot be zero!"
+
+        margin = min(margin, chunk_size)
+
+        segmented_mix = {}
+
+        if self.args["chunks"] == 0 or samples < chunk_size:
+            chunk_size = samples
+
+        counter = -1
+        for skip in range(0, samples, chunk_size):
+            counter += 1
+            s_margin = 0 if counter == 0 else margin
+            end = min(skip + chunk_size + margin, samples)
+            start = skip - s_margin
+            segmented_mix[skip] = mix[:, start:end].copy()
+            if end == samples:
+                break
+
+        sources = self.demix_base(segmented_mix, margin_size=margin)
+        return sources
+
+    def demix_base(self, mixes, margin_size):
+        """
+        Base function for source separation.
+
+        Args:
+            mixes (dict): Dictionary of segmented mixtures.
+            margin_size (int): Size of the margin.
+
+        Returns
+        -------
+        np.ndarray: Separated sources.
+        """
+        chunked_sources = []
+        progress_bar = tqdm(total=len(mixes))
+        progress_bar.set_description("Source separation")
+
+        for mix in mixes:
+            cmix = mixes[mix]
+            sources = []
+            n_sample = cmix.shape[1]
+            model = self.model_
+            trim = model.n_fft // 2
+            gen_size = model.chunk_size - 2 * trim
+            pad = gen_size - n_sample % gen_size
+            mix_p = np.concatenate(
+                (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
+            )
+            mix_waves = []
+            i = 0
+            while i < n_sample + pad:
+                waves = np.array(mix_p[:, i : i + model.chunk_size])
+                mix_waves.append(waves)
+                i += gen_size
+
+            mix_waves = torch.tensor(np.array(mix_waves), dtype=torch.float32)
+
+            with torch.no_grad():
+                _ort = self.model
+                spek = model.stft(mix_waves)
+                if self.args["denoise"]:
+                    spec_pred = (
+                        -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
+                        + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
+                    )
+                    tar_waves = model.istft(torch.tensor(spec_pred))
+                else:
+                    tar_waves = model.istft(
+                        torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
+                    )
+                tar_signal = (
+                    tar_waves[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).numpy()[:, :-pad]
+                )
+
+                start = 0 if mix == 0 else margin_size
+                end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
+
+                if margin_size == 0:
+                    end = None
+
+                sources.append(tar_signal[:, start:end])
+
+                progress_bar.update(1)
+
+            chunked_sources.append(sources)
+        _sources = np.concatenate(chunked_sources, axis=-1)
+
+        progress_bar.close()
+        return _sources
+
+    def predict(self, mix):
+        """
+        Predict the separated sources from the input mix.
+
+        Args:
+            mix (np.ndarray): Input mixture signal.
+
+        Returns
+        -------
+        tuple: Tuple containing the mixture minus the separated sources and the separated sources.
+        """
+        if mix.ndim == 1:
+            mix = np.asfortranarray([mix, mix])
+
+        tail = mix.shape[1] % (self.args["chunks"] * 44100)
+        if mix.shape[1] % (self.args["chunks"] * 44100) != 0:
+            mix = np.pad(
+                mix,
+                (
+                    (0, 0),
+                    (
+                        0,
+                        self.args["chunks"] * 44100 - mix.shape[1] % (self.args["chunks"] * 44100),
+                    ),
+                ),
+            )
+
+        mix = mix.T
+        sources = self.demix(mix.T)
+        opt = sources[0].T
+
+        if tail != 0:
+            return (
+                (mix - opt)[: -(self.args["chunks"] * 44100 - tail), :],
+                opt[: -(self.args["chunks"] * 44100 - tail), :],
+            )
+        return ((mix - opt), opt)
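Predictor can also be exercised on its own, outside model.py, mirroring how source_separation() and export_to_wav() use it. A minimal sketch with the same args dictionary; the input and output WAV paths are assumptions, and the waveform is expected to be mono float32 at 44.1 kHz as load_audio() produces:

import numpy as np
import soundfile as sf

from youtube_karaoke.source_separation import Predictor

args = {
    "model_path": "data/models/UVR-MDX-NET-Inst_HQ_3.onnx",
    "denoise": True,
    "margin": 44100,
    "chunks": 15,
    "n_fft": 6144,
    "dim_t": 8,
    "dim_f": 3072,
}
predictor = Predictor(args=args, device="cpu")

# Hypothetical input file; predict() duplicates a mono signal to stereo internally.
waveform, sample_rate = sf.read("data/samples/temp.wav", dtype="float32")
if waveform.ndim > 1:
    waveform = waveform.mean(axis=1)  # downmix to mono

vocals, no_vocals = predictor.predict(waveform)  # same unpacking order as model.py
sf.write("data/samples/vocals.wav", vocals[:, 0], sample_rate)
sf.write("data/samples/no_vocals.wav", no_vocals[:, 0], sample_rate)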