# TODO:
# + [x] Load Configuration
# + [ ] Checking
# + [ ] Better saving directory
import sys
from pathlib import Path

import torch
import torch.nn as nn  # noqa: F401 -- unused here; kept in case other code relies on it
import torchaudio
import gradio as gr
from transformers import pipeline

# Local imports
sys.path.append("src")
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none  # noqa: F401 -- unused here; kept deliberately

# Check if GPU is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# ASR part: collect the evaluation wav files and build the recognizer.
data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
# sorted() accepts the glob generator directly -- no intermediate list() needed.
audio_files = sorted(Path(data_path).glob("**/*wav"))
# audio_files = sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))
transcriber = pipeline(
    "automatic-speech-recognition",
    model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
)


# TTS part
def load_model(lang, tag, vocoder_tag):
    """Load a pretrained ESPnet TTS model for the given language and tag.

    Args:
        lang: ``"Japanese"`` or ``"English"``.
        tag: pretrained-model tag passed to ``Text2Speech.from_pretrained``.
        vocoder_tag: vocoder tag; the string ``"none"`` disables the vocoder.
            Only the ``"Japanese"`` branch honors this argument -- the
            ``"English"`` branch derives the vocoder from ``tag`` and
            ignores it.

    Returns:
        A ``(tts_model, vocoder)`` pair where ``vocoder`` is ``None``, a
        vocoder tag string, or the short name ``"melgan"``.

    Raises:
        ValueError: if the ``(lang, tag)`` combination is not supported.
    """
    if lang == "Japanese":
        # NOTE(review): both tags below are LJSpeech (English-dataset) models
        # even though this is the "Japanese" branch -- looks like a
        # copy/paste slip; confirm the intended Japanese model tags.
        if tag == "kan-bayashi/ljspeech_parallel_wavegan":
            tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
        elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
            tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
        vocoder = None if vocoder_tag == "none" else vocoder_tag
    elif lang == "English":
        # VITS needs no vocoder; others do
        if tag == "kan-bayashi/libritts_xvector_vits":
            tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
            vocoder = None
        elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
            tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
            vocoder = "melgan"
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
    else:
        raise ValueError(f"Not supported: lang={lang}")
    return tts_model, vocoder


# NOTE(review): the second return value is a vocoder spec (None for VITS),
# not a tag string; the name "vocoder_tag" is kept because downstream code
# reads it under that name.  The vocoder_tag argument passed here is ignored
# by the English branch (see load_model docstring).
tts_model, vocoder_tag = load_model(
    lang="English",
    tag="kan-bayashi/libritts_xvector_vits",
    vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long",
)
# Move the TTS model onto the selected device.
tts_model = tts_model.to(device)

# BUG FIX: for the VITS model load_model() returns None (the object), not the
# string "none", so the original test `vocoder_tag == "none"` was False and
# execution fell through to building a vocoder from None -- a guaranteed
# crash.  Treat both None and "none" as "no vocoder".
if vocoder_tag in (None, "none"):
    vocoder = None
else:
    # NOTE(review): `torchaudio.models.vocoder` is not a released torchaudio
    # API -- this path cannot have worked as written; confirm the intended
    # vocoder loader before enabling a non-VITS model.
    vocoder = torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)


# Gradio part
def synthesize(text):
    """Synthesize speech for *text*; return the waveform as a numpy array.

    Args:
        text: input sentence to synthesize.

    Returns:
        1-D ``numpy.ndarray`` waveform on the CPU.
    """
    with torch.no_grad():
        # Text-to-speech.  NOTE(review): assumes Text2Speech.__call__ returns
        # an indexable whose first element is the waveform -- confirm against
        # the installed espnet2 version (newer releases return a dict).
        wav = tts_model(text)[0]
        if vocoder is not None:
            # Apply vocoder
            wav = vocoder.inference(wav)
        # Convert to numpy array.  NOTE(review): gradio's "audio" output
        # conventionally expects a (sample_rate, ndarray) tuple; returning a
        # bare array relies on gradio's default rate -- verify playback speed.
        wav = wav.squeeze().cpu().numpy()
    return wav


interface = gr.Interface(synthesize, inputs="text", outputs="audio")
interface.launch()