enhg-parsing / benepar /ptb_unescape.py
nielklug's picture
add parsing
8778cfe
# Whole-token replacements: typographic quotes and dashes mapped to the
# ASCII forms used for them here. Keys are written as escapes; the glyph is
# shown in the trailing comment.
PTB_UNESCAPE_MAPPING = {
    "\u00ab": '"',  # «
    "\u00bb": '"',  # »
    "\u2018": "'",  # ‘
    "\u2019": "'",  # ’
    "\u201c": '"',  # “
    "\u201d": '"',  # ”
    "\u201e": '"',  # „
    "\u2039": "'",  # ‹
    "\u203a": "'",  # ›
    "\u2013": "--",  # en dash
    "\u2014": "--",  # em dash
}

# A token starting with any of these prefixes suppresses the space before it.
NO_SPACE_BEFORE = {
    "-RRB-",
    "-RCB-",
    "-RSB-",
    "''",
    "%",
    ".",
    ",",
    "!",
    "?",
    ":",
    ";",
}

# A token ending with any of these suffixes suppresses the space after it.
NO_SPACE_AFTER = {
    "-LRB-",
    "-LCB-",
    "-LSB-",
    "``",
    "`",
    "$",
    "#",
}

# English-only tokens (compared lower-cased) that suppress the space before
# them: the bare apostrophe plus apostrophe-led clitics.
NO_SPACE_BEFORE_TOKENS_ENGLISH = {"'"} | {
    "'" + suffix for suffix in ("s", "ll", "re", "d", "m", "ve")
}

# Escapes that begin or end with "-" but must not be treated as hyphens.
PTB_DASH_ESCAPED = {
    "-LRB-",
    "-RRB-",
    "-LCB-",
    "-RCB-",
    "-LSB-",
    "-RSB-",
    "--",
}
def ptb_unescape(words):
    """Convert escaped tokens back to their plain-text spelling.

    Each token is first looked up as a whole in ``PTB_UNESCAPE_MAPPING``.
    The result then has every escape sequence it contains replaced, in a
    fixed order.

    Args:
        words: Iterable of token strings.

    Returns:
        A new list holding one unescaped string per input token.
    """
    # Applied strictly in this order: "``" has to be rewritten before the
    # single backtick rule runs.
    substitutions = (
        # The "/" and "*" rules were not yet added for the parser version in
        # https://arxiv.org/abs/1812.11760v1 and related model releases
        # (e.g. benepar_en2).
        ("\\/", "/"),
        ("\\*", "*"),
        # Brackets are replaced anywhere in the token because mid-token
        # punctuation occurs in biomedical text.
        ("-LSB-", "["),
        ("-RSB-", "]"),
        ("-LRB-", "("),
        ("-RRB-", ")"),
        ("-LCB-", "{"),
        ("-RCB-", "}"),
        ("``", '"'),
        ("`", "'"),
        ("''", '"'),
    )

    unescaped = []
    for token in words:
        text = PTB_UNESCAPE_MAPPING.get(token, token)
        for escaped, plain in substitutions:
            text = text.replace(escaped, plain)
        unescaped.append(text)
    return unescaped
def guess_space_after_non_english(escaped_words):
    """Guess which tokens are followed by a space, without English rules.

    Args:
        escaped_words: Token strings in escaped form.

    Returns:
        A list of booleans, one per token. An entry is ``False`` when the
        heuristics decide no space follows that token, ``True`` otherwise.
    """
    # str.startswith / str.endswith accept a tuple of alternatives.
    attach_left = tuple(NO_SPACE_BEFORE)
    attach_right = tuple(NO_SPACE_AFTER)
    dash_escapes = tuple(PTB_DASH_ESCAPED)

    space_flags = [True for _ in escaped_words]
    for idx, token in enumerate(escaped_words):
        # A leading "-" counts as a hyphen unless it opens an escape
        # such as "-LRB-" or "--".
        leading_hyphen = token.startswith("-") and not token.startswith(
            dash_escapes
        )
        glued_to_previous = (
            leading_hyphen or token.startswith(attach_left) or token == "'"
        )
        if idx > 0 and glued_to_previous:
            space_flags[idx - 1] = False

        # Same reasoning for a trailing "-".
        trailing_hyphen = token.endswith("-") and not token.endswith(
            dash_escapes
        )
        if trailing_hyphen or token.endswith(attach_right):
            space_flags[idx] = False
    return space_flags
def guess_space_after(escaped_words, for_english=True):
    """Guess which tokens are followed by a space.

    Args:
        escaped_words: Sequence of token strings in escaped form.
        for_english: When false, the work is delegated to
            ``guess_space_after_non_english``. When true, additional rules
            for "n't", "can" + "not" and the tokens in
            ``NO_SPACE_BEFORE_TOKENS_ENGLISH`` are applied.

    Returns:
        A list of booleans, one per token. An entry is ``False`` when the
        heuristics decide no space follows that token, ``True`` otherwise.
    """
    if not for_english:
        return guess_space_after_non_english(escaped_words)

    # str.startswith / str.endswith accept a tuple of alternatives.
    attach_left = tuple(NO_SPACE_BEFORE)
    attach_right = tuple(NO_SPACE_AFTER)
    dash_escapes = tuple(PTB_DASH_ESCAPED)

    space_flags = [True for _ in escaped_words]
    for idx, token in enumerate(escaped_words):
        lowered = token.lower()
        # A leading "-" counts as a hyphen unless it opens an escape
        # such as "-LRB-" or "--".
        leading_hyphen = token.startswith("-") and not token.startswith(
            dash_escapes
        )
        # Every condition below clears the flag of the previous token, so
        # they are evaluated as one disjunction.
        if idx > 0 and (
            lowered == "n't"
            or (lowered == "not" and escaped_words[idx - 1].lower() == "can")
            or leading_hyphen
            or token.startswith(attach_left)
            or lowered in NO_SPACE_BEFORE_TOKENS_ENGLISH
        ):
            space_flags[idx - 1] = False

        # Same reasoning for a trailing "-".
        trailing_hyphen = token.endswith("-") and not token.endswith(
            dash_escapes
        )
        if trailing_hyphen or token.endswith(attach_right):
            space_flags[idx] = False
    return space_flags