import nltk
import jieba
import sudachipy
import langid

# Fetch the Punkt models that nltk's sentence tokenizer relies on.
nltk.download('punkt')

# Constrain langid to the three languages this function can handle.
langid.set_languages(['en', 'zh', 'ja'])


def split_text_into_sentences(text):
    # Classify the language once up front instead of re-running
    # langid for every branch.
    lang = langid.classify(text)[0]

    if lang == "en":
        # NLTK's Punkt tokenizer handles English sentence boundaries.
        return nltk.tokenize.sent_tokenize(text)
    elif lang == "zh":
        # jieba segments into words; rebuild sentences by cutting at
        # Chinese end-of-sentence punctuation. Fullwidth ! and ? are
        # used here, consistent with the other fullwidth marks.
        sentences = []
        segs = list(jieba.cut(text, cut_all=False))
        start = 0
        for i, seg in enumerate(segs):
            if seg in ["。", "!", "?", "……"]:
                sentences.append("".join(segs[start:i + 1]))
                start = i + 1
        if start < len(segs):
            # Keep trailing text that lacks a final punctuation mark.
            sentences.append("".join(segs[start:]))
        return sentences
    elif lang == "ja":
        # SudachiPy tokenizes Japanese; a sentence ends at a token whose
        # part of speech is 補助記号 (supplementary symbol) / 句点 (full stop).
        sentences = []
        tokenizer = sudachipy.Dictionary().create()
        current_sentence = ""
        for token in tokenizer.tokenize(text):
            current_sentence += token.surface()
            pos = token.part_of_speech()
            if pos[0] == "補助記号" and pos[1] == "句点":
                sentences.append(current_sentence)
                current_sentence = ""
        if current_sentence:
            # Keep trailing text without a closing 句点.
            sentences.append(current_sentence)
        return sentences
    # langid.set_languages restricts classification to en/zh/ja,
    # so control can never reach this point.
    raise RuntimeError("Unreachable: langid is restricted to en, zh, and ja.")
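

# A minimal usage sketch. The sample strings below are illustrative
# assumptions, not part of the original code; each call should return
# a list of sentences for the detected language.
if __name__ == "__main__":
    for sample in [
        "Hello there. How are you today?",
        "今天天气很好。我们去公园吧!",
        "今日はいい天気です。公園に行きましょう。",
    ]:
        print(split_text_into_sentences(sample))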