File size: 1,118 Bytes
7cfca48 09f9c26 7cfca48 1809a17 7cfca48 1809a17 7cfca48 1809a17 7cfca48 31bf2aa a32918a 31bf2aa a32918a 31bf2aa a32918a 31bf2aa a32918a 7cfca48 c92ce97 09f9c26 c92ce97 c36ebf7 c92ce97 7cfca48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
from hazm import word_tokenize
from hazm import sent_tokenize
import re
import six
from normalizer import normalize
# Body of a regex character class (no surrounding brackets): ASCII digits
# 0-9, the digits ۰-۹, a set of Arabic-script letters, and the zero-width
# non-joiner (U+200C).
# NOTE(review): this appears to be the same literal as the default `regex`
# argument of filter_by_lang_regex, which does not reference this name —
# confirm whether the two are meant to stay in sync.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
    """Return True when enough of ``text`` consists of allowed characters.

    Spaces are ignored on both sides of the comparison: the number of
    non-space characters matched by ``regex`` is divided by the number of
    non-space characters in ``text``.

    Args:
        text: A ``str``, or UTF-8 encoded ``bytes`` (decoded before counting).
        ratio: Threshold the allowed-character fraction must strictly exceed.
        regex: Body of a regex character class (without the brackets)
            listing the allowed characters.

    Returns:
        ``True`` if the fraction of allowed characters is greater than
        ``ratio``. ``False`` when ``text`` has no non-space characters, since
        there is nothing to measure (this used to raise ZeroDivisionError).

    Raises:
        TypeError: If ``text`` is neither ``str`` nor ``bytes``.
    """
    # Decode once, up front, so both lengths are measured in characters of
    # the same string (previously only the numerator was decoded and bytes
    # input failed on the later ``replace``).
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    elif not isinstance(text, str):
        raise TypeError("not expecting type '%s'" % type(text))

    non_space = text.replace(" ", "")
    if not non_space:
        return False

    # Runs of disallowed characters collapse to a space, which is then
    # dropped together with the spaces already present in the text.
    allowed = re.sub(r"[^" + regex + "]+", " ", text).replace(" ", "")
    return (len(allowed) / len(non_space)) > ratio
def filter_by_num_tokens(text, gt=64):
    """Return True when ``word_tokenize(text)`` yields more than ``gt`` tokens."""
    tokens = word_tokenize(text)
    token_count = len(tokens)
    return token_count > gt
def filter_by_num_sents(text, gt=2):
    """Return True when ``sent_tokenize(text)`` yields more than ``gt`` sentences."""
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)
    return sentence_count > gt
def filter_by_adv(text, ratio=50):
    """Return True when the punctuation tally of ``text`` is below ``ratio``.

    The tally is the sum of:
      * the number of pieces ``text`` splits into on "," (commas + 1),
      * the number of ``word:word`` pairs found by the regex,
      * the number of pieces ``text`` splits into on "،" (occurrences + 1).

    Because each split contributes one extra piece, text with none of
    these marks still has a tally of 2.
    """
    # Splitting on a separator always yields one more piece than there are
    # separators, so count + 1 equals the length of the split result.
    comma_pieces = text.count(",") + 1
    virgool_pieces = text.count("،") + 1
    colon_pairs = re.findall(r"""(?:([^\W]+):([^\W]+))""", text)
    tally = comma_pieces + len(colon_pairs) + virgool_pieces
    return tally < ratio
def normalizer(text, do_lowercase=False):
    """Run ``text`` through ``normalize`` and optionally lowercase the result.

    Args:
        text: Input passed straight to ``normalize``.
        do_lowercase: When true, ``.lower()`` is applied to the normalized
            text before it is returned.

    Returns:
        The normalized (and possibly lowercased) text.
    """
    normalized = normalize(text)
    return normalized.lower() if do_lowercase else normalized
|