import re

import six
from hazm import sent_tokenize, word_tokenize

from normalizer import normalize

# Characters accepted as "Persian" text: ASCII and Persian digits, the Persian
# alphabet, and the zero-width non-joiner (U+200C).
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"


def filter_by_lang_regex(text, ratio=0.7, regex=persian_regex):
    """Keep texts whose share of characters matching ``regex`` exceeds ``ratio``."""
    # Replace everything outside the allowed character set, then drop spaces so
    # the ratio compares character counts only.
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
    text = text.replace(" ", "")

    # Guard against empty input so the ratio below never divides by zero.
    if not text:
        return False

    return (len(candidate_text) / len(text)) > ratio


def filter_by_num_tokens(text, gt=64):
    """Keep texts that contain more than ``gt`` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Keep texts that contain more than ``gt`` sentences."""
    return len(sent_tokenize(text)) > gt


def normalizer(text, do_lowercase=False):
    """Normalize a text with the project normalizer and optionally lowercase it."""
    text = normalize(text)
    if do_lowercase:
        text = text.lower()

    return text
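# Minimal usage sketch (illustrative only, not part of the original pipeline):
# the sample sentence and the relaxed thresholds below are made up to show how
# the normalizer and filters are meant to be chained per document.
if __name__ == "__main__":
    sample = "این یک متن نمونه برای آزمایش فیلترها است."

    cleaned = normalizer(sample)
    keep = (
        filter_by_lang_regex(cleaned)
        and filter_by_num_tokens(cleaned, gt=5)
        and filter_by_num_sents(cleaned, gt=0)
    )
    print(keep)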