voice-matcher-api / app /string_processor.py
arnabg95's picture
v2
fe79a8f
raw
history blame
686 Bytes
import unicodedata
import re
def clean_transcription(text: str) -> str:
    """Return *text* with diacritics, the ʻokina and punctuation removed.

    Steps, in order:

    1. Decompose to NFKD so accented letters split into a base letter plus
       combining marks, and compatibility forms (ligatures, full-width
       letters, ...) fold to their plain equivalents.
    2. Drop every character with a non-zero canonical combining class,
       i.e. the diacritics exposed by step 1.
    3. Recompose to NFC so characters that NFKD split into non-combining
       pieces (for example Hangul syllables into conjoining jamo) are put
       back together instead of leaking out in decomposed form.
    4. Remove every U+02BB (ʻ) character.
    5. Remove everything that is neither a word character nor whitespace.
    6. Strip leading and trailing whitespace.

    Underscores survive step 5 because ``\\w`` matches them, and internal
    whitespace is left untouched.

    Args:
        text: The transcription to clean.

    Returns:
        The cleaned string.
    """
    decomposed = unicodedata.normalize('NFKD', text)

    # combining() returns the canonical combining class; it is non-zero for
    # the accent marks that the decomposition separated from their letters.
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )

    # The marks are gone, so recomposing cannot bring diacritics back; it
    # only re-joins sequences such as Hangul jamo that NFKD pulled apart.
    recomposed = unicodedata.normalize('NFC', without_marks)

    # U+02BB is a modifier *letter*, so the \w-based filter below would keep
    # it; it has to be removed explicitly (every occurrence, not only a
    # leading one).
    without_okina = recomposed.replace('ʻ', '')

    # Drop punctuation and symbols; keep word characters and whitespace.
    word_chars_only = re.sub(r'[^\w\s]', '', without_okina)

    return word_chars_only.strip()