import re
import unicodedata
from typing import Iterable


# Non-ASCII letters that NFKD normalization does not decompose,
# mapped to ASCII transliterations by hand.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}


PORTUGUESE_ACCENTED_CHARACTERS = [
    "ç",
    "á",
    "é",
    "í",
    "ó",
    "ú",
    "â",
    "ê",
    "ô",
    "ã",
    "õ",
    "à",
    "ò",
    "è",
    "ì",
    "ù",
]

PORTUGUESE_DIACRITICS = ['̧', '̂', '̀', '̃', '́']
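# Note: PORTUGUESE_DIACRITICS holds the combining marks (cedilla, circumflex, grave,
# tilde, acute) that the accented characters above decompose into under NFKD, so
# passing it as `keep` to remove_symbols_and_diacritics() preserves Portuguese accents.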


def remove_symbols_and_diacritics(s: str, keep: Iterable[str] = "") -> str:
    """
    Replace markers, symbols, and punctuation with a space, and drop diacritics
    (Unicode category 'Mn' plus some manual mappings), keeping any characters
    listed in `keep`.
    """
    return "".join(
        c
        if c in keep
        else ADDITIONAL_DIACRITICS[c]
        if c in ADDITIONAL_DIACRITICS
        else ""
        if unicodedata.category(c) == "Mn"
        else " "
        if unicodedata.category(c)[0] in "MSP"
        else c
        for c in unicodedata.normalize("NFKD", s)
    )
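
# Illustrative example (the sample string is made up for demonstration): with the
# Portuguese combining marks kept,
#   remove_symbols_and_diacritics("João, tudo bem?", keep=PORTUGUESE_DIACRITICS)
# preserves the tilde of "João" (in decomposed NFKD form) and turns "," and "?" into
# spaces; without `keep`, the tilde would be dropped as well.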


class PortugueseTextNormalizer:
    def __init__(self):
        # hesitation/filler tokens to drop entirely
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh)\b"
        # common Portuguese abbreviations expanded to their full forms
        self.replacers = {
            r"\bsr\b": "senhor ",
            r"\bsra\b": "senhora ",
            r"\bsto\b": "santo ",
            r"\bsta\b": "santa ",
            r"\bdr\b": "doutor ",
            r"\bdra\b": "doutora ",
            r"\bprof\b": "professor ",
            r"\bcap\b": "capitão ",
        }

    def __call__(self, s: str):
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parentheses
        s = re.sub(self.ignore_patterns, "", s)  # remove filler words

        # expand abbreviations
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        # remove commas and periods between digits (e.g. thousands separators)
        s = re.sub(r"(\d),(\d)", r"\1\2", s)
        s = re.sub(r"(\d)\.(\d)", r"\1\2", s)

        # strip symbols and diacritics, keeping Portuguese combining marks
        s = remove_symbols_and_diacritics(s, keep=PORTUGUESE_DIACRITICS)

        # collapse runs of whitespace into a single space
        s = re.sub(r"\s+", " ", s)

        return s.lower()
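

if __name__ == "__main__":
    # Minimal usage sketch; the sample sentence below is made up for illustration
    # and is not part of any dataset or test suite.
    normalizer = PortugueseTextNormalizer()
    print(normalizer("Olá, Sr. Silva! [ruído] Está tudo bem? (risos) São 1.500,75 reais."))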