import re
import os
import nltk
import torch
import pickle
import torchaudio
import numpy as np
from TTS.tts.models.xtts import Xtts
from nltk.tokenize import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
def _load_array(filename):
    """ Opens a pickled file and returns its contents; used for the .npy speaker files """
    with open(filename, 'rb') as f:
        return pickle.load(f)
os.environ['COQUI_TOS_AGREED'] = '1'  # accept the Coqui TTS license non-interactively

nltk.download('punkt')  # sentence tokenizer data used by sent_tokenize
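# On newer nltk releases the tokenizer may also need the 'punkt_tab' resource;
# if sent_tokenize raises a LookupError, additionally run:
#   nltk.download('punkt_tab')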
# XTTS model used to generate audio based on a voice sample
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_path = "tts_model"
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=(device == 'cuda'),  # DeepSpeed inference only works on GPU
)
model.to(device)
# Speaker latent
path_latents = 'assets/gpt_cond_latent.npy'
gpt_cond_latent = _load_array(path_latents)
# Speaker embedding
path_embedding = 'assets/speaker_embedding.npy'
speaker_embedding = _load_array(path_embedding)
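# The two files above are assumed to have been precomputed once from a reference
# recording of the target speaker. A minimal sketch of how they could be created,
# assuming a local file 'reference.wav' (hypothetical name):
#
#   gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
#       audio_path=['reference.wav'])
#   with open('assets/gpt_cond_latent.npy', 'wb') as f:
#       pickle.dump(gpt_cond_latent, f)
#   with open('assets/speaker_embedding.npy', 'wb') as f:
#       pickle.dump(speaker_embedding, f)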
def get_audio(text: str, language: str = 'es', saving_path: str = 'output') -> None:
    """
    Creates an audio file from the given text and saves it as <saving_path>.wav
    :param text: text to convert to audio
    :param language: 'es', 'en' or 'pt', language used for the audio file
    :param saving_path: path to save the audio, without the .wav extension
    :return: None
    """
    _save_audio(text, language, saving_path)
def _save_audio(text: str, language: str, path_audio: str) -> None:
    """
    Cleans the text, splits it into sentences and synthesizes an audio segment for
    each one, then concatenates all the segments and saves them into a single file.
    :param text: input text
    :param language: language used in the audio
    :param path_audio: saving path of the audio, without the .wav extension
    :return: None
    """
# Split the answer into sentences and clean it
sentences = _get_clean_text(text, language)
# Get the voice of each sentence
audio_segments = []
for sentence in sentences:
audio_stream = _get_voice(sentence, language)
audio_stream = torch.tensor(audio_stream)
audio_segments.append(audio_stream)
    # Concatenate all the segments and save them as one mono file (XTTS outputs 24 kHz audio)
    concatenated_audio = torch.cat(audio_segments, dim=0)
    torchaudio.save(f'{path_audio}.wav', concatenated_audio.unsqueeze(0), 24000)
def _get_voice(sentence: str, language: str) -> np.ndarray:
    """
    Synthesizes the given sentence in the given language and returns the waveform
    :param sentence: input sentence
    :param language: language used in the audio
    :return: numpy array with the audio samples
    """
    out = model.inference(
        sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,  # low temperature for a more deterministic, stable delivery
    )
return out['wav']
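# Note: if latency matters, recent TTS releases also expose a streaming variant
# (Xtts.inference_stream) that yields audio chunks as they are generated, e.g.:
#
#   for chunk in model.inference_stream(sentence, language,
#                                       gpt_cond_latent, speaker_embedding):
#       ...  # play or buffer each chunk (a torch tensor)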
def _get_clean_text(text: str, language: str) -> list[str]:
"""
    Cleans the text, replaces links with a spoken phrase and splits it into smaller sentences using nltk.
:param text: input text for the audio
:param language: language used for the audio ('es', 'en', 'pt')
:return: list of sentences
"""
    # Replace each link with a spoken phrase and set the per-language character limit
if language == 'en':
clean_answer = re.sub(r'http[s]?://\S+', 'the following link', text)
max_characters = 250
elif language == 'es':
clean_answer = re.sub(r'http[s]?://\S+', 'el siguiente link', text)
max_characters = 239
else:
clean_answer = re.sub(r'http[s]?://\S+', 'o seguinte link', text)
max_characters = 203
# Change the name from Bella to Bela
clean_answer = clean_answer.replace('Bella', 'Bela')
# Remove Florida and zipcode
clean_answer = re.sub(r', FL \d+', "", clean_answer)
    # Split the text into sentences with nltk and make sure each one is within the character limit
split_sentences = sent_tokenize(clean_answer)
sentences = []
for sentence in split_sentences:
if len(sentence) > max_characters:
sentences.extend(_split_sentence(sentence, max_characters))
else:
sentences.append(sentence)
return sentences
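# For instance (hypothetical input), with language='es' the text
#   'Bella Suites, FL 33101. Visita https://example.com para reservar.'
# becomes
#   ['Bela Suites.', 'Visita el siguiente link para reservar.']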
def _split_sentence(sentence: str, max_characters: int) -> list[str]:
    """
    Used when a sentence is still too long. The split point is the comma nearest to
    the middle of the sentence; if there is no comma, the nearest space is used, and
    if there are no spaces either, the sentence is cut at the exact middle. If the
    resulting halves are still too long, the function recurses on them.
    :param sentence: sentence to be split
    :param max_characters: max number of characters a sentence can have
    :return: list of sentences
    """
# Get index of each comma
sentences = []
commas = [i for i, c in enumerate(sentence) if c == ',']
# No commas, search for spaces
if len(commas) == 0:
commas = [i for i, c in enumerate(sentence) if c == ' ']
# No commas or spaces, split it in the middle
if len(commas) == 0:
sentences.append(sentence[:len(sentence) // 2])
sentences.append(sentence[len(sentence) // 2:])
return sentences
# Nearest index to the middle
split_point = min(commas, key=lambda x: abs(x - (len(sentence) // 2)))
    if sentence[split_point] == ',':
        left = sentence[:split_point]
        right = sentence[split_point + 2:]  # skip the comma and the space after it
    else:
        left = sentence[:split_point]
        right = sentence[split_point + 1:]  # skip the space
if len(left) > max_characters:
sentences.extend(_split_sentence(left, max_characters))
else:
sentences.append(left)
if len(right) > max_characters:
sentences.extend(_split_sentence(right, max_characters))
else:
sentences.append(right)
return sentences
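
if __name__ == '__main__':
    # Minimal smoke test (hypothetical text): writes output.wav next to this script
    get_audio('Hola, bienvenido. Para más detalles visita https://example.com')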