Spaces:

liuhaozhe6788
/

CelebChat

Running

CelebChat / run_tts.py

lhzstar

new commits

edcdcdb 8 months ago

No virus

4.75 kB

	import argparse
	from ctypes import alignment
	import os
	os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
	import sys
	sys.path.append('rtvc/')
	from pathlib import Path
	import time
	import spacy
	import matplotlib.pyplot as plt

	import librosa
	import numpy as np
	import soundfile as sf
	import torch
	import noisereduce as nr
	import io
	from scipy.io.wavfile import write
	import base64
	import streamlit as st


	from rtvc.encoder import inference as encoder
	from rtvc.encoder.params_data import *
	from rtvc.encoder.params_model import model_embedding_size as speaker_embedding_size
	from rtvc.synthesizer.inference import Synthesizer_infer
	from rtvc.utils.argutils import print_args
	from rtvc.utils.default_models import ensure_default_models
	from rtvc.vocoder import inference as vocoder
	from rtvc.vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
	from rtvc.synthesizer.utils.cleaners import english_cleaners_predict
	from rtvc.speed_changer.fixSpeed import *


	def tts(text, embed_name, nlp, autoplay=True):
	    """Synthesize ``text`` in a stored voice and return the audio as base64 WAV.

	    The text is cleaned, split into sentences with ``nlp``, and each sentence
	    is turned into a mel spectrogram conditioned on the speaker embedding
	    loaded from ``embeds/<embed_name>.npy``. The concatenated spectrogram is
	    inverted with Griffin-Lim, denoised, and a 0.15 s pause is inserted after
	    every sentence.

	    Args:
	        text: Text to speak. Must contain at least one non-whitespace character.
	        embed_name: Stem of the ``.npy`` speaker-embedding file under ``embeds/``.
	        nlp: Callable whose result exposes ``.sents`` (the script entry point
	            passes a spaCy pipeline).
	        autoplay: When true, also play the audio through ``sounddevice`` and
	            block for roughly the clip duration.

	    Returns:
	        The WAV file contents (float32 samples), base64-encoded as ``str``.

	    Raises:
	        ValueError: If ``text`` is empty/whitespace or yields no sentences.
	    """
	    # Fail fast, before loading any model, instead of surfacing an opaque
	    # "need at least one array to concatenate" error further down.
	    if not text or not text.strip():
	        raise ValueError("text must contain at least one non-whitespace character")

	    run_id = "default"
	    models_dir = Path("rtvc/saved_models")
	    embed_path = f"embeds/{embed_name}.npy"

	    ensure_default_models(run_id, models_dir)
	    synthesizer = Synthesizer_infer(list(models_dir.glob(f"{run_id}/synthesizer.pt"))[0])

	    ## Generating the spectrogram
	    def split_text(text):
	        """Clean ``text`` and split the paragraph into stripped sentences."""
	        text = english_cleaners_predict(text)
	        return [sent.text.strip() for sent in nlp(text).sents]

	    texts = split_text(text)
	    print(f"the list of inputs texts:\n{texts}")
	    if not texts:
	        raise ValueError("text did not yield any sentence to synthesize")

	    embed = np.load(embed_path)
	    specs = []
	    alignments = []
	    stop_tokens = []

	    # The synthesizer works in batch, so each sentence and the embedding are
	    # wrapped in one-element lists and the single result is unpacked.
	    for sentence in texts:
	        spec, align, stop_token = synthesizer.synthesize_spectrograms(
	            [sentence], [embed], require_visualization=True
	        )
	        specs.append(spec[0])
	        alignments.append(align[0])
	        stop_tokens.append(stop_token[0])

	    # Remember how many frames each sentence produced so the waveform can be
	    # cut back into sentences after vocoding.
	    frame_counts = [spec.shape[1] for spec in specs]
	    spec = np.concatenate(specs, axis=1)

	    ## Directory for synthesizer visualization results
	    # exist_ok avoids the check-then-create race of exists() + mkdir().
	    os.makedirs("syn_results", exist_ok=True)
	    print("Created the mel spectrogram")

	    ## Generating the waveform
	    print("Synthesizing the waveform:")

	    # Griffin-Lim is used here rather than the neural vocoder, whose model is
	    # not loaded by this function.
	    wav = synthesizer.griffin_lim(spec)
	    wav = vocoder.waveform_denoising(wav)

	    # Add breaks: convert per-sentence frame counts into sample offsets, slice
	    # the waveform per sentence, and append a 0.15 s silence after each slice.
	    b_ends = np.cumsum(np.array(frame_counts) * Synthesizer_infer.hparams.hop_size)
	    b_starts = np.concatenate(([0], b_ends[:-1]))
	    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
	    pause = np.zeros(int(0.15 * Synthesizer_infer.sample_rate))
	    wav = np.concatenate([piece for sentence_wav in wavs for piece in (sentence_wav, pause)])

	    # Peak-normalize, then apply the original gain of 10. Skip when the signal
	    # is entirely silent so 0/0 does not fill the output with NaNs.
	    # NOTE(review): float WAV data is nominally in [-1, 1]; a peak of 10 looks
	    # like a deliberate loudness boost - confirm it is intended.
	    peak = np.abs(wav).max()
	    if peak > 0:
	        wav = wav / peak * 10

	    if autoplay:
	        # Start playback, then sleep for about the clip length so the caller
	        # only continues once the audio has finished.
	        import sounddevice as sd
	        try:
	            sd.stop()
	            sd.play(wav, synthesizer.sample_rate)
	            time_span = len(wav) // synthesizer.sample_rate + 1
	            time.sleep(time_span)
	        except sd.PortAudioError as e:
	            print("\nCaught exception: %s" % repr(e))
	            print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")

	    # Encode the WAV in memory; getvalue() returns the whole buffer regardless
	    # of where the stream position was left by the writer.
	    byte_io = io.BytesIO()
	    write(byte_io, synthesizer.sample_rate, wav.astype(np.float32))
	    return base64.b64encode(byte_io.getvalue()).decode()

	if __name__ == "__main__":
	    # Demo entry point: synthesize one fixed sentence in a stored voice
	    # without local playback, then embed the result in the Streamlit page
	    # as an HTML5 audio element backed by an inline base64 data URI.
	    sentence_splitter = spacy.load('en_core_web_lg')
	    encoded_audio = tts(
	        text="Adkins was raised by a young single mother in various working-class neighbourhoods of London.",
	        embed_name="Helen_Mirren",
	        nlp=sentence_splitter,
	        autoplay=False,
	    )

	    audio_html = f"""
	<audio controls autoplay>
	<source src="data:audio/wav;base64,{encoded_audio}" type="audio/wav">
	Your browser does not support the audio element.
	</audio>
	"""
	    # Raw HTML is required for the <audio> tag to be rendered.
	    st.markdown(audio_html, unsafe_allow_html=True)