import os
import subprocess

import git
import gradio as gr
# Runtime bootstrap: fetch the Coqui-TTS fork used for voice cloning and
# install the packages that are imported further down. These commands must
# run before the corresponding imports, so the order below matters.
# Checkout directory of the cloned repository (last argument of `git clone`).
# Previously this name was referenced but never defined, which raised
# NameError at start-up.
TTS_PATH = "TTS/"

os.system('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS')
os.system('pip install -q -e TTS/')
os.system('pip install -q torchaudio==0.9.0')
os.system('pip install voicefixer --upgrade')

# Imported only after the pip install above has run.
from voicefixer import VoiceFixer

# Shared restoration model used by greet() to post-process synthesized audio.
voicefixer = VoiceFixer()

import sys

# Make the cloned TTS checkout importable.
sys.path.append(TTS_PATH)
import os |
import string |
import time |
import argparse |
import json |
import numpy as np |
import IPython |
from IPython.display import Audio |
import torch |
import torchaudio |
from speechbrain.pretrained import SpectralMaskEnhancement |
import whisper |
# Whisper model used by greet() to transcribe the user's spoken question.
model1 = whisper.load_model("tiny")

import openai

# MetricGAN+ speech enhancer applied to the final synthesized audio.
# The device is chosen at runtime: a hard-coded "cuda" fails on CPU-only
# hosts, while the rest of this file already guards GPU use with
# torch.cuda.is_available().
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

# Chat history sent to the OpenAI API; greet() appends every user and
# assistant turn to this module-level list, so it is shared by all calls.
mes = [
    {"role": "system", "content": "You are my personal assistant. Try to be helpful."}
]

# Transcripts of every question handled so far (appended to by greet()).
res = []
# Imports from the Coqui-TTS checkout installed during bootstrap.
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
# The original wrapped this import in try/except and retried the *same*
# import under a bare `except:`; the fallback could never succeed where the
# first attempt failed and only hid the real error, so it is imported once.
from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
# --- Output directory and model asset locations ---------------------------
OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
CONFIG_PATH = '/home/user/app/config.json'
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"

USE_CUDA = torch.cuda.is_available()

# --- Configuration and audio processor -------------------------------------
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

# --- Build the TTS model and restore its weights ---------------------------
model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# The checkpoint is always deserialized onto the CPU; the model is moved to
# the GPU afterwards when one is available.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# Work on a copy of the stored weights and discard every entry whose name
# mentions the speaker encoder before loading them into the model.
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" not in key:
        continue
    model_weights.pop(key)

model.load_state_dict(model_weights)
model.eval()

if USE_CUDA:
    model = model.cuda()

use_griffin_lim = False
# Install the audio helpers at runtime; this has to happen before pydub is
# imported below.
os.system('pip install -q pydub ffmpeg-normalize')

# Speaker-encoder assets, resolved relative to the working directory.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

# Speaker manager that greet() uses to turn a reference clip into a d-vector.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)
def compute_spec(ref_file):
    """Load an audio file and return its spectrogram as a float tensor.

    The audio is loaded with librosa at the sample rate of the module-level
    audio processor ``ap``, converted with ``ap.spectrogram`` and wrapped in
    a ``torch.FloatTensor`` with one extra leading dimension of size 1.

    Args:
        ref_file: Path of the audio file to analyse.

    Returns:
        The spectrogram tensor produced as described above.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)
def greet(Text2, audio, Voicetoclone, VoiceMicrophone):
    """Handle one voice-chat turn and answer in a cloned voice.

    Pipeline: transcribe the recorded question with Whisper, send it to
    ChatGPT together with the shared history in ``mes``, synthesize the reply
    with the TTS model conditioned on the reference voice, then post-process
    the audio with VoiceFixer and the MetricGAN+ enhancer.

    Args:
        Text2: OpenAI API key supplied by the user.
        audio: Path of the recorded question.
        Voicetoclone: Path of an uploaded reference voice, or None.
        VoiceMicrophone: Path of a microphone-recorded reference voice; used
            only when ``Voicetoclone`` is None.

    Returns:
        A list ``[transcript, chat_response, "enhanced.wav"]``.

    Raises:
        ValueError: If no question or no reference voice was provided, if the
            reference file is larger than 30 MB, or if the ChatGPT reply is
            longer than 2000 characters.

    Note:
        ``mes`` and ``res`` are module-level lists, and the intermediate and
        final audio files have fixed names, so state and output files are
        shared between all callers of this function.
    """
    max_reference_bytes = 30 * 1000000
    max_text_chars = 2000
    size_message = (
        "File is greater than 30mb or Text inserted is longer than 2000 "
        "characters. Please re-try with smaller sizes."
    )

    # --- Validate inputs before doing any expensive or billable work -------
    if audio is None:
        raise ValueError("No question audio was provided. Please record a message and re-try.")

    reference_file = Voicetoclone if Voicetoclone is not None else VoiceMicrophone
    if reference_file is None:
        raise ValueError("No reference voice was provided. Please upload or record one and re-try.")
    reference_file = str(reference_file)
    print("path url")
    print(reference_file)

    # Measure the file on disk. The previous check multiplied the length of
    # the *path string* by its in-memory size, which says nothing about the
    # audio file and rejected requests purely because of a long path.
    if os.path.getsize(reference_file) > max_reference_bytes:
        print(size_message)
        raise ValueError(size_message)

    openai.api_key = Text2

    # --- Speech to text ----------------------------------------------------
    speech = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(speech).to(model1.device)
    _, probs = model1.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    # Half precision is only requested when the Whisper model is on a GPU.
    options = whisper.DecodingOptions(fp16=(model1.device.type == "cuda"))
    result = whisper.decode(model1, mel, options)
    res.append(result.text)

    # --- Chat completion ---------------------------------------------------
    mes.append({"role": "user", "content": result.text})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=mes,
    )
    chat_response = completion.choices[0].message.content
    mes.append({"role": "assistant", "content": chat_response})

    if len(chat_response) > max_text_chars:
        print(size_message)
        # Raise an ordinary exception instead of SystemExit so the failure is
        # reported for this request rather than signalling interpreter exit.
        raise ValueError(size_message)

    # --- Normalize the reference clip (best effort) ------------------------
    # The previous shell command referenced an unset `$sample` shell
    # variable, so the reference file was never actually passed. An argument
    # list also avoids shell interpretation of the path.
    normalize_cmd = [
        "ffmpeg-normalize", reference_file,
        "-nt", "rms", "-t=-27",
        "-o", reference_file,
        "-ar", "16000", "-f",
    ]
    try:
        completed = subprocess.run(normalize_cmd, check=False)
        if completed.returncode != 0:
            print("ffmpeg-normalize exited with code {}; using the reference as-is".format(completed.returncode))
    except OSError as err:
        print("ffmpeg-normalize could not be run ({}); using the reference as-is".format(err))

    # --- Voice-cloned synthesis --------------------------------------------
    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_file)
    model.length_scale = 1
    model.inference_noise_scale = 0.3
    model.inference_noise_scale_dp = 0.3
    # NOTE(review): ID 0 refers to an entry of the mapping loaded from
    # TTS_LANGUAGES; confirm which language that is.
    language_id = 0
    print(" > text: {}".format(chat_response))
    wav, _, _, _ = synthesis(
        model,
        chat_response,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")

    out_path = os.path.join(OUT_PATH, "Audio.wav")
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)

    # --- Restoration and enhancement ---------------------------------------
    # Follow the detected device instead of forcing CUDA.
    voicefixer.restore(input=out_path,
                       output="audio1.wav",
                       cuda=USE_CUDA,
                       mode=0)
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav"]
# Gradio front end for greet(). The four inputs map positionally onto
# greet(Text2, audio, Voicetoclone, VoiceMicrophone).
#
# Fixes over the previous version:
#   * the third input was followed by "." instead of ",", which turned the
#     fourth component into an attribute access and left greet() one
#     argument short;
#   * a comma was missing after `title=`, which is a SyntaxError;
#   * the interface was built and launched twice; the first launch() call
#     preceded the second definition, so only one definition is kept.
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(lines=1, label="请填写您的OpenAI-API-key"),
        gr.Audio(source="microphone", label='在这里进行对话', type="filepath"),
        gr.Audio(source="upload", type="filepath", label='请上传您喜欢的声音(wav文件)'),
        gr.Audio(source="microphone", type="filepath", label="请用麦克风上传您喜欢的声音,与wav文件二选一即可"),
    ],
    outputs=[
        gr.Textbox(label="Speech to Text"),
        gr.Textbox(label="ChatGPT Output"),
        gr.Audio(label="Audio with Custom Voice"),
    ],
    title='🥳💬💕 - TalktoAI,随时随地,谈天说地!',
    theme="huggingface",
    description="🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)

# show_error=True surfaces exceptions raised by greet() in the web UI.
demo.launch(show_error=True)