""" TODO: + [x] Load Configuration + [ ] Checking + [ ] Better saving directory """ import numpy as np from pathlib import Path import jiwer import pdb import torch.nn as nn import torch import torchaudio from transformers import pipeline # from time import process_time, time from pathlib import Path import time # local import import sys from espnet2.bin.tts_inference import Text2Speech # pdb.set_trace() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") sys.path.append("src") import gradio as gr # ASR part audio_files = [ str(x) for x in sorted( Path( "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video" ).glob("**/*wav") ) ] # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))] transcriber = pipeline( "automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1", ) old_transcriber = pipeline( "automatic-speech-recognition", "facebook/wav2vec2-base-960h" ) # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1") # 【Female】kan-bayashi ljspeech parallel wavegan # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits") # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder # pdb.set_trace() from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub from fairseq.models.text_to_speech.hub_interface import TTSHubInterface # @title English multi-speaker pretrained model { run: "auto" } lang = "English" tag = "kan-bayashi/libritts_xvector_vits" # vits needs no vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"} from espnet2.bin.tts_inference import Text2Speech from espnet2.utils.types import str_or_none text2speech = Text2Speech.from_pretrained( model_tag=str_or_none(tag), vocoder_tag=str_or_none(vocoder_tag), device="cuda", use_att_constraint=False, backward_window=1, forward_window=3, speed_control_alpha=1.0, ) import glob import os import numpy as np import kaldiio # Get model directory path from espnet_model_zoo.downloader import ModelDownloader d = ModelDownloader() model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"]) # Speaker x-vector selection xvector_ark = [ p for p in glob.glob( f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True ) if "tr" in p ][0] xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)} spks = list(xvectors.keys()) male_spks = { "M1": "2300_131720", "M2": "1320_122612", "M3": "1188_133604", "M4": "61_70970", } female_spks = {"F1": "2961_961", "F2": "8463_287645", "F3": "121_121726"} spks = dict(male_spks, **female_spks) spk_names = sorted(spks.keys()) ## 20230224 Mousa: No reference, def ASRold(audio_file): reg_text = old_transcriber(audio_file)["text"] return reg_text def ASRnew(audio_file, state=""): # pdb.set_trace() time.sleep(2) reg_text = transcriber(audio_file)["text"] state += reg_text + "\n" return state, state def VAD(audio_file): # pdb.set_trace() reg_text = transcriber(audio_file)["text"] return 1 reference_textbox = gr.Textbox( value="", placeholder="Input reference here", label="Reference", ) 
recognition_textbox = gr.Textbox(
    value="",
    placeholder="Output recognition here",
    label="Recognition",
)

speaker_option = gr.Radio(choices=spk_names, label="Speaker")

input_audio = gr.Audio(
    source="upload", type="filepath", label="Audio_to_Evaluate"
)
output_audio = gr.Audio(
    source="upload", type="filepath", label="Synthesized Audio"
)
examples = [
    ["./samples/001.wav", "M1", ""],
    ["./samples/002.wav", "M2", ""],
    ["./samples/003.wav", "F1", ""],
    ["./samples/004.wav", "F2", ""],
]


def change_audiobox(choice):
    """Show the upload or microphone audio widget depending on `choice`."""
    if choice == "upload":
        input_audio = gr.Audio.update(source="upload", visible=True)
    elif choice == "microphone":
        input_audio = gr.Audio.update(source="microphone", visible=True)
    else:
        input_audio = gr.Audio.update(visible=False)
    return input_audio


demo = gr.Interface(
    fn=ASRnew,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=["textbox", "state"],
    live=True,
)
# ASRnew(["/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav", "state"])
# VAD("/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav")

demo.launch(share=False)
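
# --- Illustrative sketch (not part of the original script) ---
# `change_audiobox` above is defined but never connected to any component.
# One hedged way to use it is inside a gr.Blocks layout (Gradio 3.x style,
# matching the gr.Audio.update calls above), where a Radio choosing between
# "upload" and "microphone" toggles the audio input. The component names
# below are made up for this sketch; it is kept commented out so it does not
# interfere with the Interface launched above.
#
# with gr.Blocks() as picker_demo:
#     source_choice = gr.Radio(
#         choices=["upload", "microphone"], value="upload", label="Input source"
#     )
#     eval_audio = gr.Audio(
#         source="upload", type="filepath", label="Audio_to_Evaluate"
#     )
#     source_choice.change(
#         fn=change_audiobox, inputs=source_choice, outputs=eval_audio
#     )
# picker_demo.launch(share=False)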