Spaces:
Sleeping
Sleeping
# Whole-token replacements applied by ptb_unescape: typographic quotes,
# guillemets and long dashes are mapped to plain ASCII equivalents.
# Keys are matched against the entire token, never against a substring.
PTB_UNESCAPE_MAPPING = {
    "«": '"',
    "»": '"',
    "‘": "'",
    "’": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "‹": "'",
    "›": "'",
    "\u2013": "--",  # en dash
    "\u2014": "--",  # em dash
}
# Prefixes of tokens that attach to the previous token (no space before
# them): closing-bracket escapes, closing quotes and common punctuation.
NO_SPACE_BEFORE = {"-RRB-", "-RCB-", "-RSB-", "''"} | set("%.,!?:;")
# Suffixes of tokens that attach to the next token (no space after them):
# opening-bracket escapes, opening quotes and the "$" / "#" symbols.
NO_SPACE_AFTER = {"-LRB-", "-LCB-", "-LSB-", "``", "`"} | set("$#")
# Whole tokens (compared lower-cased) that attach to the previous token
# when guessing spacing for English text.
NO_SPACE_BEFORE_TOKENS_ENGLISH = {"'", "'s", "'ll", "'re", "'d", "'m", "'ve"}
# Escapes that begin/end with "-" but must not be treated as hyphenated
# word fragments by the leading/trailing-hyphen spacing rule.
PTB_DASH_ESCAPED = {"-RRB-", "-RCB-", "-RSB-", "-LRB-", "-LCB-", "-LSB-", "--"}
def ptb_unescape(words):
    """Return the tokens in *words* with PTB-style escapes undone.

    Each token is first looked up as a whole in ``PTB_UNESCAPE_MAPPING``
    (typographic quotes and dashes), then escape sequences are replaced
    wherever they occur inside the token.

    Args:
        words: iterable of token strings.

    Returns:
        A new list holding one unescaped string per input token; the
        input is not modified.
    """
    cleaned_words = []
    for word in words:
        # Whole-token substitution; tokens without an entry pass through.
        word = PTB_UNESCAPE_MAPPING.get(word, word)
        # This un-escaping for / and * was not yet added for the
        # parser version in https://arxiv.org/abs/1812.11760v1
        # and related model releases (e.g. benepar_en2)
        word = word.replace("\\/", "/").replace("\\*", "*")
        # Mid-token punctuation occurs in biomedical text
        word = word.replace("-LSB-", "[").replace("-RSB-", "]")
        word = word.replace("-LRB-", "(").replace("-RRB-", ")")
        word = word.replace("-LCB-", "{").replace("-RCB-", "}")
        # Order matters: the double backtick must be handled before the
        # single backtick, otherwise "``" would become two apostrophes.
        word = word.replace("``", '"').replace("`", "'").replace("''", '"')
        cleaned_words.append(word)
    return cleaned_words
def guess_space_after_non_english(escaped_words):
    """Guess, for each escaped token, whether a space follows it.

    Language-neutral variant: only punctuation, bracket/quote escapes,
    a bare apostrophe and leading/trailing hyphens influence spacing.

    Args:
        escaped_words: sequence of token strings still in escaped form
            (e.g. ``-LRB-`` rather than ``(``).

    Returns:
        A list of booleans, one per token; ``False`` at index ``i`` means
        token ``i`` is directly followed by token ``i + 1``.
    """
    sp_after = [True for _ in escaped_words]
    for i, word in enumerate(escaped_words):
        # Does this token attach to the previous one?
        if i > 0 and (
            (
                # Leading hyphen that is not part of a bracket/dash escape.
                word.startswith("-")
                and not any(word.startswith(x) for x in PTB_DASH_ESCAPED)
            )
            or any(word.startswith(x) for x in NO_SPACE_BEFORE)
            or word == "'"
        ):
            sp_after[i - 1] = False
        # Does this token attach to the next one?
        if (
            # Trailing hyphen that is not part of a bracket/dash escape.
            word.endswith("-") and not any(word.endswith(x) for x in PTB_DASH_ESCAPED)
        ) or any(word.endswith(x) for x in NO_SPACE_AFTER):
            sp_after[i] = False
    return sp_after
def guess_space_after(escaped_words, for_english=True):
    """Guess, for each escaped token, whether a space follows it.

    With ``for_english=False`` the work is delegated to
    ``guess_space_after_non_english``. Otherwise the same punctuation,
    bracket/quote and hyphen rules are applied, plus English-specific
    handling of clitics ("n't", "'s", "'ll", ...) and of "can" followed
    by "not".

    Args:
        escaped_words: sequence of token strings still in escaped form
            (e.g. ``-LRB-`` rather than ``(``).
        for_english: apply the English-specific rules when true.

    Returns:
        A list of booleans, one per token; ``False`` at index ``i`` means
        token ``i`` is directly followed by token ``i + 1``.
    """
    if not for_english:
        return guess_space_after_non_english(escaped_words)

    sp_after = [True for _ in escaped_words]
    for i, word in enumerate(escaped_words):
        # English contractions: "do" + "n't", "can" + "not".
        if word.lower() == "n't" and i > 0:
            sp_after[i - 1] = False
        elif word.lower() == "not" and i > 0 and escaped_words[i - 1].lower() == "can":
            sp_after[i - 1] = False

        # Does this token attach to the previous one?
        if i > 0 and (
            (
                # Leading hyphen that is not part of a bracket/dash escape.
                word.startswith("-")
                and not any(word.startswith(x) for x in PTB_DASH_ESCAPED)
            )
            or any(word.startswith(x) for x in NO_SPACE_BEFORE)
            or word.lower() in NO_SPACE_BEFORE_TOKENS_ENGLISH
        ):
            sp_after[i - 1] = False

        # Does this token attach to the next one?
        if (
            # Trailing hyphen that is not part of a bracket/dash escape.
            word.endswith("-") and not any(word.endswith(x) for x in PTB_DASH_ESCAPED)
        ) or any(word.endswith(x) for x in NO_SPACE_AFTER):
            sp_after[i] = False

    return sp_after