Spaces:
Running
on
L4
Running
on
L4
import nltk | |
import jieba | |
import sudachipy | |
import langid | |
# Fetch the Punkt sentence-tokenizer data only when it is not already
# installed locally, so importing this module does not contact the NLTK
# download index on every start-up.
# NOTE(review): recent NLTK releases load "punkt_tab" instead of "punkt" --
# confirm against the NLTK version this project pins.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Restrict language identification to the three languages that
# split_text_into_sentences knows how to handle.
langid.set_languages(['en', 'zh', 'ja'])
# Characters that close a Chinese sentence. jieba yields non-Han punctuation
# one character at a time, so the ellipsis is listed as the single character
# "…"; runs such as "……" or "!!" are merged by _split_chinese.
_ZH_SENTENCE_TERMINATORS = frozenset("。!?!?…")

# Lazily loaded Sudachi dictionary, shared between calls because loading it
# is far more expensive than creating a tokenizer from it.
_SUDACHI_DICTIONARY = None


def _is_zh_terminator(segment):
    """Return True if *segment* consists only of sentence-ending marks."""
    return bool(segment) and all(
        char in _ZH_SENTENCE_TERMINATORS for char in segment
    )


def _split_chinese(text):
    """Split Chinese *text* after each run of sentence-ending punctuation."""
    sentences = []
    current = []
    previous_was_terminator = False
    for segment in jieba.cut(text, cut_all=False):
        is_terminator = _is_zh_terminator(segment)
        # Close the sentence only once the run of terminators has ended, so
        # "……" or "?!" stays attached to the sentence it finishes.
        if previous_was_terminator and not is_terminator:
            sentences.append("".join(current))
            current = []
        current.append(segment)
        previous_was_terminator = is_terminator
    if current:
        sentences.append("".join(current))
    return sentences


def _split_japanese(text):
    """Split Japanese *text* after each token Sudachi tags as a full stop."""
    global _SUDACHI_DICTIONARY
    if _SUDACHI_DICTIONARY is None:
        _SUDACHI_DICTIONARY = sudachipy.Dictionary()
    # A fresh tokenizer per call; only the dictionary is shared.
    tokenizer = _SUDACHI_DICTIONARY.create()

    sentences = []
    current = []
    for token in tokenizer.tokenize(text):
        current.append(token.surface())
        pos = token.part_of_speech()
        # Compare element-wise: the container type of the part-of-speech
        # value is not relied upon here.
        if pos[0] == "補助記号" and pos[1] == "句点":
            sentences.append("".join(current))
            current = []
    if current:
        sentences.append("".join(current))
    return sentences


def split_text_into_sentences(text):
    """Split *text* into sentences according to its detected language.

    The language is identified once with ``langid`` and the text is then
    dispatched to a language-specific splitter:

    * ``"en"`` -- NLTK's ``sent_tokenize``.
    * ``"zh"`` -- jieba segmentation, splitting after runs of
      ``。``, ``!``/``!``, ``?``/``?`` and ``…``.
    * ``"ja"`` -- Sudachi tokenization, splitting after tokens whose part of
      speech is 補助記号 / 句点.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings. In the Chinese and Japanese branches any
        trailing text without a terminator is returned as a final sentence.

    Raises:
        RuntimeError: If the detected language is not one of the three
            handled above.
    """
    # Classify once instead of re-running the classifier for every branch.
    language = langid.classify(text)[0]
    if language == "en":
        return nltk.tokenize.sent_tokenize(text)
    if language == "zh":
        return _split_chinese(text)
    if language == "ja":
        return _split_japanese(text)
    raise RuntimeError("It is impossible to reach here.")