File size: 4,904 Bytes
6c702b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import re
import logging
from typing import Set, Tuple, List
# Candidate fragment: a run of non-terminator characters followed by any
# number of terminators (. ? ! …) and closing quote characters.
SENTENCE_SPLITTER = re.compile(r'[^\.?!…]+[\.?!…]*["»“]*')
# Letters-only word (Latin or Cyrillic а-я, any case) right before a final dot;
# the word is captured in group 1.
LAST_WORD_PATTERN = re.compile(r'(?:\b|\d)([a-zа-я]+)\.$', re.IGNORECASE)
# First word of a string, skipping any leading non-word characters.
FIRST_WORD_PATTERN = re.compile(r'^\W*(\w+)')
# A single Latin letter followed by the final dot. Group 1 is whatever precedes
# the letter (a digit, a non-word character, or '' at a word boundary).
ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN = re.compile(r'(\d|\W|\b)([a-zA-Z])\.$')
# Two dot-terminated word chunks with no space between them at the very end
# (the shape "xx.yy.").
HAS_DOT_INSIDE_PATTERN = re.compile(r'[\w]+\.[\w]+\.$', re.IGNORECASE)
# A single uppercase Latin/Cyrillic letter followed by the final dot. Group 1
# is the preceding non-word character ('' at a word boundary).
INITIALS_PATTERN = re.compile(r'(\W|\b)([A-ZА-Я]{1})\.$')
# The whole string consists of 1-4 Russian consonant letters.
ONLY_RUS_CONSONANTS_PATTERN = re.compile(r'^[бвгджзйклмнпрстфхцчшщ]{1,4}$', re.IGNORECASE)
# The string begins with at least one whitespace character.
STARTS_WITH_EMPTYNESS_PATTERN = re.compile(r'^\s+')
# The string ENDS with an emotional terminator: one of ! ? … or a run of two or
# more dots, optionally followed by one whitespace character and closing
# punctuation. The alternation is grouped so that `$` anchors both branches;
# ungrouped, `|` split the whole pattern and a bare ! ? … anywhere in the
# string matched. The closing class may repeat because a fragment produced by
# SENTENCE_SPLITTER can end with several quote characters.
ENDS_WITH_EMOTION_PATTERN = re.compile(r'(?:[!?…]|\.{2,})\s?[)"«»,“]*$')
# Optional whitespace, an optional dash/hyphen/opening bracket/quote, optional
# whitespace, then a lowercase Latin or Cyrillic (а-я) letter.
STARTS_WITH_LOWER_PATTERN = re.compile(r'^\s*[–-—-("«]?\s*[a-zа-я]')
# Optional whitespace followed by a digit.
STARTS_WITH_DIGIT_PATTERN = re.compile(r'^\s*\d')
# The whole string is a list marker: optional non-word characters, then Roman
# numeral letters and/or digits, then a dot.
NUMERATION_PATTERN = re.compile(r'^\W*[IVXMCL\d]+\.$')
# Two dot-terminated words separated by a single space at the end of the
# string (the shape "x. y."); both words are captured.
PAIRED_SHORTENING_IN_THE_END_PATTERN = re.compile(r'\b(\w+)\. (\w+)\.\W*$')
# Verdicts returned by is_sentence_end():
#   JOIN  (0) - not a boundary, keep accumulating the current sentence;
#   MAYBE (1) - undecided, the caller inspects how the following text starts;
#   SPLIT (2) - a sentence boundary.
JOIN, MAYBE, SPLIT = range(3)
# Abbreviations (lowercase, without the trailing dot) after which the text is
# always joined with what follows.
JOINING_SHORTENINGS = {
    # Latin-script entries.
    'mr', 'mrs', 'ms', 'dr', 'vs',
    # Cyrillic entries.
    'англ', 'итал', 'греч', 'евр', 'араб', 'яп', 'слав', 'кит',
    'тел', 'св', 'ул', 'устар', 'им', 'г', 'см', 'д', 'стр',
    'корп', 'пл', 'пер', 'сокр', 'рис',
}

# Abbreviations (lowercase, without the trailing dot) after which a boundary is
# only possible; the verdict for them is MAYBE.
SHORTENINGS = {
    # Latin-script entries.
    'co', 'corp', 'inc',
    # Cyrillic entries.
    'авт', 'адм', 'барр', 'внутр', 'га', 'дифф', 'дол', 'долл', 'зав', 'зам',
    'искл', 'коп', 'корп', 'куб', 'лат', 'мин', 'о', 'обл', 'обр', 'прим',
    'проц', 'р', 'ред', 'руб', 'рус', 'русск', 'сан', 'сек', 'тыс', 'эт',
    'яз', 'гос', 'мн', 'жен', 'муж', 'накл', 'повел', 'букв', 'шутл', 'ед',
}

# Two-part abbreviations written as "x. y."; each entry is (first, second).
PAIRED_SHORTENINGS = {
    ('и', 'о'),
    ('т', 'е'),
    ('т', 'п'),
    ('у', 'е'),
    ('н', 'э'),
}
def split_sentences(text: str) -> List[str]:
    """Cut ``text`` into candidate fragments.

    Every non-overlapping match of ``SENTENCE_SPLITTER`` is returned in text
    order with surrounding whitespace stripped. The fragments are only
    candidates; no abbreviation handling is applied here.
    """
    raw_fragments = SENTENCE_SPLITTER.findall(text)
    return list(map(str.strip, raw_fragments))
def is_sentence_end(left: str, right: str,
                    shortenings: Set[str],
                    joining_shortenings: Set[str],
                    paired_shortenings: Set[Tuple[str, str]]) -> int:
    """Classify the position between ``left`` and ``right``.

    The rules are tried in a fixed order and the first one that fires decides
    the verdict.

    Args:
        left: text of the sentence accumulated so far, ending at the candidate
            boundary.
        right: the text that follows the candidate boundary.
        shortenings: lowercase abbreviations that yield ``MAYBE``.
        joining_shortenings: lowercase abbreviations that yield ``JOIN``.
        paired_shortenings: ``(first, second)`` parts of two-part
            abbreviations that yield ``MAYBE``.

    Returns:
        ``JOIN``, ``MAYBE`` or ``SPLIT``.
    """
    # No whitespace after the candidate: never a boundary.
    if STARTS_WITH_EMPTYNESS_PATTERN.match(right) is None:
        return JOIN

    # Tail shaped like "xx.yy." is kept together with what follows.
    if HAS_DOT_INSIDE_PATTERN.search(left) is not None:
        return JOIN

    # ' ' stands in for "no dotted last word"; it is still used in the
    # lookups further down.
    last_word = ' '
    last_word_match = LAST_WORD_PATTERN.search(left)
    if last_word_match is not None:
        last_word = last_word_match.group(1)
        if last_word.lower() in joining_shortenings:
            return JOIN
        # A short, consonant-only word ending in a lowercase letter.
        if last_word[-1].islower() and ONLY_RUS_CONSONANTS_PATTERN.search(last_word):
            return MAYBE

    # Both halves of a paired abbreviation sit at the end of ``left``.
    paired_tail = PAIRED_SHORTENING_IN_THE_END_PATTERN.search(left)
    if paired_tail is not None and paired_tail.groups() in paired_shortenings:
        return MAYBE

    # A paired abbreviation straddles the candidate boundary.
    first_word_match = FIRST_WORD_PATTERN.match(right)
    if first_word_match is not None:
        if (last_word, first_word_match.group(1)) in paired_shortenings:
            return MAYBE

    if ENDS_WITH_EMOTION_PATTERN.search(left) and STARTS_WITH_LOWER_PATTERN.match(right):
        return JOIN

    # A single capital letter with a dot, unless directly preceded by
    # a degree sign or an apostrophe.
    initial_match = INITIALS_PATTERN.search(left)
    if initial_match is not None:
        preceding = initial_match.group(1) or ' '
        if preceding not in "°'":
            return JOIN

    if last_word.lower() in shortenings:
        return MAYBE

    # A single Latin letter with a dot, with the same degree/apostrophe exception.
    latin_letter_match = ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN.search(left)
    if latin_letter_match is not None:
        preceding = latin_letter_match.group(1) or ' '
        if preceding not in "°'":
            return MAYBE

    # ``left`` as a whole is a list marker such as "1." or "IV.".
    return JOIN if NUMERATION_PATTERN.match(left) else SPLIT
def split_by_sentences(text: str,
                       shortenings: Set[str] = SHORTENINGS,
                       joining_shortenings: Set[str] = JOINING_SHORTENINGS,
                       paired_shortenings: Set[Tuple[str, str]] = PAIRED_SHORTENINGS) -> List[str]:
    """Split ``text`` into sentences.

    The text is first cut into candidate fragments by ``split_sentences``.
    Each fragment end is then classified by ``is_sentence_end``: ``JOIN``
    keeps accumulating, ``SPLIT`` closes the sentence, and ``MAYBE`` closes it
    unless the following text starts with a lowercase letter or a digit.

    Args:
        text: the text to split.
        shortenings: lowercase abbreviations after which a boundary is only
            possible.
        joining_shortenings: lowercase abbreviations that never end a sentence.
        paired_shortenings: ``(first, second)`` parts of two-part
            abbreviations.

    Returns:
        The sentences in text order, stripped of surrounding whitespace.
        Empty sentences are never included.
    """
    sentences = []
    sent_start = 0  # where the sentence being accumulated begins
    cursor = 0      # end of the most recently located fragment

    for fragment in split_sentences(text):
        # Fragments are stripped substrings of ``text`` in text order, so the
        # next one is searched for from the exact end of the previous one.
        # Advancing by len(fragment) alone would ignore skipped whitespace
        # and let a short fragment be found at an earlier identical spot.
        span_end = text.index(fragment, cursor) + len(fragment)
        cursor = span_end

        rest = text[span_end:]
        verdict = is_sentence_end(text[sent_start:span_end], rest,
                                  shortenings, joining_shortenings, paired_shortenings)
        if verdict == JOIN:
            continue
        if verdict == MAYBE and (STARTS_WITH_LOWER_PATTERN.match(rest)
                                 or STARTS_WITH_DIGIT_PATTERN.match(rest)):
            continue

        sentence = text[sent_start:span_end].strip()
        # A whitespace-only fragment (e.g. trailing spaces) yields nothing.
        if sentence:
            sentences.append(sentence)
        sent_start = span_end

    # Whatever is left after the last boundary forms the final sentence.
    tail = text[sent_start:].strip()
    if tail:
        sentences.append(tail)
    return sentences