from svoice.separate import *
import scipy.io.wavfile as wav
import gradio as gr
import os
import torch
import soundfile as sf
from transformers import pipeline
from glob import glob

# Load the pretrained svoice separation model and pick a device.
load_model()
device = "cuda" if torch.cuda.is_available() else "cpu"

BASE_PATH = os.path.dirname(os.path.abspath(__file__))
os.makedirs('input', exist_ok=True)
os.makedirs('separated', exist_ok=True)

# Whisper ASR pipeline used to transcribe each separated voice.
print(f"Loading ASR model on {device}...")
pipe = pipeline("automatic-speech-recognition",
                model="openai/whisper-small",
                device=0 if device == "cuda" else -1)
print("ASR model loaded!")


def transcribe_audio(audiopath):
    """Transcribe a single audio file with the Whisper pipeline."""
    audio_input, sr = sf.read(audiopath)
    # Forward the sampling rate so the pipeline can resample if needed.
    return pipe({"raw": audio_input, "sampling_rate": sr}, max_new_tokens=500)['text']


def separator(audio, rec_audio, example):
    """Separate the voices in the input mixture and transcribe each one."""
    outputs = {}

    # Clear leftovers from the previous run.
    for f in glob('input/*'):
        os.remove(f)
    for f in glob('separated/*'):
        os.remove(f)

    # Prefer the uploaded clip, then the microphone recording, then the example file.
    if audio:
        wav.write('input/original.wav', audio[0], audio[1])
    elif rec_audio:
        wav.write('input/original.wav', rec_audio[0], rec_audio[1])
    else:
        os.system(f'cp {example} input/original.wav')

    # Run svoice separation on everything in ./input.
    separate_demo(mix_dir="./input")
    separated_files = glob(os.path.join('separated', "*.wav"))
    separated_files = sorted([f for f in separated_files if "original.wav" not in f])

    # Transcribe each separated voice.
    outputs["transcripts"] = []
    for i, f in enumerate(separated_files):
        print(f"Transcribing separated audio {i + 1} ...")
        outputs["transcripts"].append(transcribe_audio(f))
        print("Text:", outputs["transcripts"][-1])

    return separated_files + outputs['transcripts']


def set_example_audio(example: list) -> dict:
    return gr.Audio.update(value=example[0])


demo = gr.Blocks()
with demo:
    gr.Markdown('''
    This is a demo of the multiple-voice separation algorithm. The model is trained on the LibriMix7 dataset and can be used to separate multiple voices from a single audio file. *This is an intermediate checkpoint, shared for experimentation purposes only. It does not perform well at a 16 kHz sample rate, so you can use svoice_demo to train it at 8 kHz instead.*