Spaces:

nielklug
/

enhg-parsing

Sleeping

App Files Files Community

enhg-parsing / benepar /ptb_unescape.py

nielklug

add parsing

8778cfe 7 months ago

raw

history blame contribute delete

2.92 kB

	# Whole-token replacements: typographic quotation marks and dashes are
	# mapped to the plain ASCII forms used on the un-escaped side.
	PTB_UNESCAPE_MAPPING = dict(
	    (
	        ("«", '"'),  # U+00AB left-pointing double angle quotation mark
	        ("»", '"'),  # U+00BB right-pointing double angle quotation mark
	        ("‘", "'"),  # U+2018 left single quotation mark
	        ("’", "'"),  # U+2019 right single quotation mark
	        ("“", '"'),  # U+201C left double quotation mark
	        ("”", '"'),  # U+201D right double quotation mark
	        ("„", '"'),  # U+201E double low-9 quotation mark
	        ("‹", "'"),  # U+2039 single left-pointing angle quotation mark
	        ("›", "'"),  # U+203A single right-pointing angle quotation mark
	        ("\u2013", "--"),  # en dash
	        ("\u2014", "--"),  # em dash
	    )
	)

	# Prefixes of tokens that attach to the previous token: closing-bracket
	# escapes, the closing-quote escape, and common trailing punctuation.
	# Matched with str.startswith by the guess_space_after* functions.
	NO_SPACE_BEFORE = {"-RRB-", "-RCB-", "-RSB-", "''"} | set("%.,!?:;")
	# Suffixes of tokens that attach to the following token: opening-bracket
	# escapes, opening-quote escapes, and the "$" / "#" symbols.
	# Matched with str.endswith by the guess_space_after* functions.
	NO_SPACE_AFTER = {"-LRB-", "-LCB-", "-LSB-", "``", "`"} | set("$#")
	# English tokens (compared after lowercasing, as whole tokens) that are
	# written without a space before them.
	NO_SPACE_BEFORE_TOKENS_ENGLISH = {"'", "'s", "'ll", "'re", "'d", "'m", "'ve"}
	# Escape sequences that begin or end with "-" but must not be treated as
	# hyphenated word fragments by the leading/trailing-hyphen rules.
	PTB_DASH_ESCAPED = {"-RRB-", "-RCB-", "-RSB-", "-LRB-", "-LCB-", "-LSB-", "--"}


	def ptb_unescape(words):
	    """Return the plain-text form of each PTB-escaped token in *words*.

	    Each token is processed independently, in this order:

	    1. A whole-token lookup in ``PTB_UNESCAPE_MAPPING`` (typographic quotes
	       and dashes); tokens without an entry are kept as they are.
	    2. The escape sequences ``\\/`` and ``\\*`` become ``/`` and ``*``.
	       Any other backslash in the token is preserved.
	    3. Bracket escapes (``-LRB-``, ``-RSB-``, ...) become the literal
	       bracket characters, anywhere inside the token.
	    4. Quote escapes: a double backtick or a doubled apostrophe becomes
	       ``"``; a remaining single backtick becomes ``'``.

	    Args:
	        words: Iterable of token strings.

	    Returns:
	        A new list holding one un-escaped string per input token.
	    """
	    cleaned_words = []
	    for word in words:
	        word = PTB_UNESCAPE_MAPPING.get(word, word)
	        # This un-escaping for / and * was not yet added for the
	        # parser version in https://arxiv.org/abs/1812.11760v1
	        # and related model releases (e.g. benepar_en2)
	        # Only these two escape sequences are rewritten, so backslashes
	        # that are genuinely part of a token are left intact.
	        word = word.replace("\\/", "/").replace("\\*", "*")
	        # Mid-token punctuation occurs in biomedical text
	        word = word.replace("-LSB-", "[").replace("-RSB-", "]")
	        word = word.replace("-LRB-", "(").replace("-RRB-", ")")
	        word = word.replace("-LCB-", "{").replace("-RCB-", "}")
	        # Order matters: the double backtick must be handled before the
	        # single backtick, and the doubled apostrophe last.
	        word = word.replace("``", '"').replace("`", "'").replace("''", '"')
	        cleaned_words.append(word)
	    return cleaned_words


	def guess_space_after_non_english(escaped_words):
	    """Guess, for each escaped token, whether a space follows it.

	    Language-neutral rules only. A token suppresses the space after the
	    previous token when it starts with a hyphen (and is not one of the
	    ``PTB_DASH_ESCAPED`` sequences), starts with a ``NO_SPACE_BEFORE``
	    prefix, or is a lone apostrophe. A token suppresses the space after
	    itself when it ends with a hyphen (same exclusion) or ends with a
	    ``NO_SPACE_AFTER`` suffix.

	    Args:
	        escaped_words: The tokens, still in their escaped form.

	    Returns:
	        A list of booleans, one per token; ``True`` means a space is
	        assumed to follow that token.
	    """
	    # str.startswith / str.endswith accept a tuple of candidates.
	    dash_escapes = tuple(PTB_DASH_ESCAPED)
	    attach_left = tuple(NO_SPACE_BEFORE)
	    attach_right = tuple(NO_SPACE_AFTER)

	    sp_after = [True for _ in escaped_words]
	    for idx, token in enumerate(escaped_words):
	        # Does this token glue onto the one before it?
	        if idx > 0 and (
	            (token.startswith("-") and not token.startswith(dash_escapes))
	            or token.startswith(attach_left)
	            or token == "'"
	        ):
	            sp_after[idx - 1] = False

	        # Does the next token glue onto this one?
	        if (
	            token.endswith("-") and not token.endswith(dash_escapes)
	        ) or token.endswith(attach_right):
	            sp_after[idx] = False

	    return sp_after


	def guess_space_after(escaped_words, for_english=True):
	    """Guess, for each escaped token, whether a space follows it.

	    With ``for_english`` false this simply delegates to
	    ``guess_space_after_non_english``. Otherwise English-specific rules
	    are applied on top of the generic prefix/suffix rules: ``n't`` and
	    the tokens in ``NO_SPACE_BEFORE_TOKENS_ENGLISH`` (compared after
	    lowercasing) attach to the previous token, and ``not`` attaches to a
	    preceding ``can``.

	    Args:
	        escaped_words: The tokens, still in their escaped form. The
	            English path indexes into it, so it must be a sequence.
	        for_english: Whether to apply the English-specific rules.

	    Returns:
	        A list of booleans, one per token; ``True`` means a space is
	        assumed to follow that token.
	    """
	    if not for_english:
	        return guess_space_after_non_english(escaped_words)

	    # str.startswith / str.endswith accept a tuple of candidates.
	    dash_escapes = tuple(PTB_DASH_ESCAPED)
	    attach_left = tuple(NO_SPACE_BEFORE)
	    attach_right = tuple(NO_SPACE_AFTER)

	    sp_after = [True for _ in escaped_words]
	    for idx, token in enumerate(escaped_words):
	        lowered = token.lower()

	        # Every rule that removes the space before the current token.
	        if idx > 0 and (
	            lowered == "n't"
	            or (lowered == "not" and escaped_words[idx - 1].lower() == "can")
	            or (token.startswith("-") and not token.startswith(dash_escapes))
	            or token.startswith(attach_left)
	            or lowered in NO_SPACE_BEFORE_TOKENS_ENGLISH
	        ):
	            sp_after[idx - 1] = False

	        # Rules that remove the space after the current token.
	        if (
	            token.endswith("-") and not token.endswith(dash_escapes)
	        ) or token.endswith(attach_right):
	            sp_after[idx] = False

	    return sp_after