# src/tts_sentence_parsing.py
import re
from src.utils import flatten_list, have_emoji, have_langid
def setup_nltk():
import nltk # we'll use this to split into sentences
nltk.download("punkt")
# If the installation instructions were followed, punkt is already downloaded;
# don't download at runtime, to avoid breaking air-gapped deployments.
# setup_nltk()
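
# Usage sketch (an assumption, not part of the original module): run setup_nltk()
# once in an environment with network access, e.g. at install time, so the
# punkt model is cached before any air-gapped deployment:
#   python -c "from src.tts_sentence_parsing import setup_nltk; setup_nltk()"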
sentence_keys = ['sentence_list', 'index']
def init_sentence_state():
sentence_state = dict(sentence_list=[], index=0)
return sentence_state
def unpack_state(sentence_state):
rets = []
for key in sentence_keys:
rets.append(sentence_state[key])
return tuple(rets)
def pack_state(sentence_state, *args):
    # mutate the existing dict rather than rebuilding it, so the caller's
    # reference stays valid; it is fine to replace the list object itself
    for keyi, key in enumerate(sentence_keys):
        sentence_state[key] = args[keyi]
return sentence_state
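

# A minimal round-trip sketch of the state helpers above (this function is
# illustrative only, not part of the original module):
def _example_state_round_trip():
    state = init_sentence_state()
    sentence_list, index = unpack_state(state)
    sentence_list.append("Hello there.")
    state = pack_state(state, sentence_list, index + len("Hello there."))
    assert state['sentence_list'] == ["Hello there."]
    assert state['index'] == len("Hello there.")
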
def split_sentences(sentence, n=250):
"""
Splits a sentence by spaces into smaller sentences, each with a maximum length of n characters,
while preserving whitespace characters like new lines.
# 250 due to [!] Warning: The text length exceeds the character limit of 250 for language 'en', this might cause truncated audio.
"""
# Splitting on spaces while preserving all whitespace characters in a list
words = re.split('(\s+)', sentence)
sentences = []
current_sentence = []
current_length = 0
for word in words:
# Skip empty strings which can occur due to consecutive whitespace
if word == '':
continue
# Check if the word is a whitespace character
if word.isspace():
if word == '\n':
# If it's a newline, end the current sentence and start a new one
sentences.append("".join(current_sentence))
current_sentence = []
current_length = 0
else:
# For other whitespace characters, add them to the current sentence
current_sentence.append(word)
current_length += len(word)
else:
# Check if adding the next word would exceed the limit
if current_length + len(word) > n:
if current_sentence:
sentences.append("".join(current_sentence))
current_sentence = [word]
current_length = len(word)
else:
# If the word itself is longer than n and there's no current sentence
sentences.append(word)
current_length = 0
else:
current_sentence.append(word)
current_length += len(word)
# Add the last sentence if it exists
if current_sentence:
sentences.append("".join(current_sentence))
return sentences
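

# A small sketch of split_sentences (illustrative, not part of the original
# module): with n=11, "alpha beta gamma delta" packs into two chunks without
# losing any characters:
def _example_split_sentences():
    text = "alpha beta gamma delta"
    chunks = split_sentences(text, n=11)
    assert chunks == ["alpha beta ", "gamma delta"]
    assert all(len(c) <= 11 for c in chunks)
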
def _get_sentences(response, verbose=False, min_start=15, max_length=250):
# no mutations of characters allowed here, only breaking apart or merging
import nltk
    # don't tokenize the first min_start characters into a sentence, so language
    # detection has enough text to work with and the logic stays simpler
sentences = nltk.sent_tokenize(response[min_start:])
# split any long sentences
sentences = flatten_list([split_sentences(x, max_length) for x in sentences])
# drop empty sentences
sentences = [x for x in sentences if x.strip()]
# restore first min_start if set
if sentences and min_start > 0:
sentences[0] = response[:min_start] + sentences[0]
elif min_start > 0:
sentences.append(response[:min_start])
return sentences
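

# Sketch of _get_sentences (illustrative, not part of the original module);
# the expected split assumes NLTK's punkt model breaks on the periods here.
# Note how the first min_start characters get glued back onto the first sentence:
def _example_get_sentences():
    text = "Hello there, friend. How are you today? I am fine."
    sentences = _get_sentences(text, min_start=15)
    assert sentences[0] == "Hello there, friend."
    assert sentences[-1] == "I am fine."
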
def get_sentence(response, sentence_state, is_final=False, verbose=False):
# get state items
sentence_list, index = unpack_state(sentence_state)
sentences = _get_sentences(response[index:], min_start=15 if index == 0 else 0, verbose=verbose)
if len(sentences) >= 2:
# detected new completed sentence
# find new index
index_delta = response[index:].index(sentences[0])
index += index_delta + len(sentences[0])
sentence_list.append(sentences[0])
        # clean only the returned copy; index tracking relies on raw text offsets
cleaned_sentence = clean_sentence(sentences[0], verbose=verbose)
return cleaned_sentence, pack_state(sentence_state, sentence_list, index), False
elif is_final:
# then just return last sentence
cleaned_sentence = clean_sentence(' '.join(sentences), verbose=verbose)
sentence_list.append(' '.join(sentences))
return cleaned_sentence, pack_state(sentence_state, sentence_list, index), True
    else:
        # no complete sentence yet; wait for more of the response
        return None, pack_state(sentence_state, sentence_list, index), True
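

# Sketch of the intended streaming loop (illustrative, not part of the original
# module): feed a growing response, emit each completed sentence once, then
# flush whatever remains with is_final=True. The expected sentence boundaries
# assume NLTK's punkt model splits on the punctuation here.
def _example_streaming_get_sentence():
    state = init_sentence_state()
    response = ""
    emitted = []
    for chunk in ["Hello there, friend. ", "How are you? ", "I am fine"]:
        response += chunk
        sentence, state, _ = get_sentence(response, state)
        if sentence:
            emitted.append(sentence)
    sentence, state, _ = get_sentence(response, state, is_final=True)
    if sentence:
        emitted.append(sentence)
    assert emitted == ["Hello there, friend.", "How are you?", "I am fine"]
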
def clean_sentence(sentence, verbose=False):
if sentence is None or len(sentence) == 0:
if verbose:
print("empty sentence")
return ''
    # remove code blocks and parentheticals
    sentence = re.sub(r"```.*?```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*?`", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"\(.*?\)", "", sentence, flags=re.DOTALL)
# remove marks
sentence = sentence.replace("```", "")
sentence = sentence.replace("...", " ")
sentence = sentence.replace("(", " ")
sentence = sentence.replace(")", " ")
sentence = sentence.replace("Dr. ", "Doctor ")
sentence = sentence.replace(" w/ ", " with ")
    sentence = sentence.replace('H2O.ai', "aych two oh ae eye.")
    sentence = sentence.replace('H2O.AI', "aych two oh ae eye.")
    sentence = sentence.replace('h2o.ai', "aych two oh ae eye.")
# filter out emojis
if have_emoji:
import emoji
sentence = ''.join([x for x in sentence if not emoji.is_emoji(x)])
    # spell out floating-point numbers so TTS reads "2 dot 5" rather than "2.5"
    sentence = re.sub(r'(\d+)\.(\d+)', r"\1 dot \2", sentence)
    # fix last bad characters: re-join word and trailing punctuation pairs
    # (the replacement re-emits the pair unchanged, so this is currently a no-op)
    sentence = re.sub(r"([^\x00-\x7F]|\w)([.。?!])", r"\1\2", sentence)
sentence = sentence.strip()
    # strip stray leading punctuation left over from earlier substitutions
    if sentence[:2] in ('. ', '? ', '! ', ', '):
        sentence = sentence[2:]
    if sentence[:1] in ('.', '?', '!', ','):
        sentence = sentence[1:]
    # speak bare list-item numbers like "1." as words
    digit_words = {'1.': 'One', '2.': 'Two', '3.': 'Three', '4.': 'Four',
                   '5.': 'Five', '6.': 'Six', '7.': 'Seven', '8.': 'Eight',
                   '9.': 'Nine', '10.': 'Ten'}
    sentence = digit_words.get(sentence, sentence)
if len(sentence) == 0:
if verbose:
print("EMPTY SENTENCE after processing")
return ''
if verbose:
print("Sentence for speech: %s" % sentence)
return sentence
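

# A few illustrative checks of clean_sentence (this function is a sketch, not
# part of the original module); outcomes follow the substitution rules above:
def _example_clean_sentence():
    out = clean_sentence("Dr. Smith released version 2.5 today.")
    assert "Doctor" in out and "2 dot 5" in out
    assert clean_sentence("3.") == "Three"
    assert clean_sentence("") == ''
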
def detect_language(prompt, supported_languages, verbose=False):
if not have_langid:
# if no package, just return english
return "en"
import langid
# Fast language autodetection
if len(prompt) > 15:
        language_predicted = langid.classify(prompt)[0].strip()  # strip needed, as output can carry a trailing space
if language_predicted == "zh":
# we use zh-cn on xtts
language_predicted = "zh-cn"
        if language_predicted not in supported_languages:
            print(f"Detected a language not supported by xtts: {language_predicted}, switching to English for now")
            language = "en"
        else:
            language = language_predicted
        if verbose:
            print(f"Language: predicted sentence language: {language_predicted}, using language for xtts: {language}")
    else:
        # too little text to detect the language reliably, so default to English
        language = "en"
        if verbose:
            print("Language: prompt is short, using English default for xtts")
return language
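

# Usage sketch (illustrative, not part of the original module): the supported
# list below is a hypothetical subset of XTTS languages, and the longer call
# exercises the optional langid path when that package is installed:
def _example_detect_language():
    supported = ["en", "es", "fr", "de", "zh-cn"]
    # prompts of 15 characters or fewer always fall back to English
    assert detect_language("Hi!", supported) == "en"
    # longer prompts go through langid; unsupported detections also fall back
    language = detect_language("¿Cómo estás hoy, amigo mío?", supported)
    assert language in supported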