# Laronix_ASR_TTS_VC / local / ASR_conpare.py
# TODO:
# + [x] Load Configuration
# + [ ] Checking
# + [ ] Better saving directory
from pathlib import Path
from transformers import pipeline
import torch
import torchaudio
import gradio as gr
import sys
# Local imports
sys.path.append("src")
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none
# Check if GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# ASR part
data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
audio_files = sorted(Path(data_path).glob("**/*.wav"))
# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
# Whisper-based ASR model fine-tuned on the PAL John recordings
transcriber = pipeline(
    "automatic-speech-recognition",
    model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
)
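
# A minimal sketch (not in the original script) of running the ASR model over
# the collected files; run_asr_on_files is a hypothetical helper. The
# transformers ASR pipeline accepts a file path and returns {"text": ...}.
def run_asr_on_files(files):
    """Transcribe each wav file, returning {path: hypothesis text}."""
    hypotheses = {}
    for wav_path in files:
        hypotheses[str(wav_path)] = transcriber(str(wav_path))["text"]
    return hypotheses
# Example usage: hyps = run_asr_on_files(audio_files)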
# TTS part
def load_model(lang, tag, vocoder_tag):
    """Load an ESPnet Text2Speech model, resolving the vocoder via vocoder_tag.

    str_or_none maps the literal string "none" to None, which tells ESPnet to
    skip the external vocoder.
    """
    if lang == "Japanese":
        if tag in (
            "kan-bayashi/ljspeech_parallel_wavegan",
            "kan-bayashi/ljspeech_merlin_multi_band_melgan",
        ):
            tts_model = Text2Speech.from_pretrained(
                model_tag=tag,
                vocoder_tag=str_or_none(vocoder_tag),
                device=str(device),
            )
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
    elif lang == "English":
        # VITS generates waveforms end-to-end and needs no vocoder; others do
        if tag == "kan-bayashi/libritts_xvector_vits":
            tts_model = Text2Speech.from_pretrained(
                model_tag=tag, vocoder_tag=None, device=str(device)
            )
        elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
            tts_model = Text2Speech.from_pretrained(
                model_tag=tag,
                vocoder_tag=str_or_none(vocoder_tag),
                device=str(device),
            )
        else:
            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
    else:
        raise ValueError(f"Not supported: lang={lang}")
    return tts_model
tts_model = load_model(
    lang="English",
    tag="kan-bayashi/libritts_xvector_vits",
    vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long",
)
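
# Hedged sketch for the "Better saving directory" TODO above: write
# synthesized waveforms into a timestamped directory under output/. The
# layout and the save_wav helper are assumptions, not the original design.
import time  # used only by this sketch

output_dir = Path("output") / time.strftime("%Y%m%d_%H%M%S")

def save_wav(wav, fs, name):
    """Save a 1-D float waveform tensor under output_dir as <name>.wav."""
    output_dir.mkdir(parents=True, exist_ok=True)
    torchaudio.save(str(output_dir / f"{name}.wav"), wav.unsqueeze(0).cpu(), fs)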
# Gradio part
def synthesize(text):
    # NOTE: xvector-conditioned models such as libritts_xvector_vits normally
    # also expect a speaker embedding (spembs=...); none is supplied here.
    with torch.no_grad():
        # Text-to-speech; any vocoder is applied inside Text2Speech
        output = tts_model(text)
        wav = output["wav"]
    # Convert to numpy; gradio's audio output expects (sampling_rate, samples)
    return tts_model.fs, wav.view(-1).cpu().numpy()
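
# Hedged sketch for the "Checking" TODO: round-trip a prompt through TTS and
# the Whisper-based ASR model so the input text and the re-transcription can
# be compared. The dict form {"raw": array, "sampling_rate": fs} is how the
# transformers pipeline takes a numpy waveform; the function itself is an
# assumption, not part of the original script.
def synthesize_and_check(text):
    fs, wav = synthesize(text)
    hypothesis = transcriber({"raw": wav, "sampling_rate": fs})["text"]
    return (fs, wav), hypothesis
# Example usage: audio, hyp = synthesize_and_check("hello world")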
interface = gr.Interface(synthesize, inputs="text", outputs="audio")
interface.launch()