File size: 1,118 Bytes
7cfca48
 
 
 
 
 
 
 
09f9c26
7cfca48
 
 
 
 
1809a17
7cfca48
 
 
1809a17
7cfca48
 
 
1809a17
7cfca48
31bf2aa
 
a32918a
31bf2aa
a32918a
31bf2aa
a32918a
31bf2aa
a32918a
7cfca48
c92ce97
 
09f9c26
c92ce97
 
c36ebf7
c92ce97
7cfca48
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from hazm import word_tokenize
from hazm import sent_tokenize
import re
import six

from normalizer import normalize

# Characters counted as acceptable text by the language filter: the ASCII
# digit range ``0-9``, the digits ۰-۹, a set of Arabic-script letters and the
# zero-width non-joiner (``\u200c``).  The string is a fragment meant to sit
# inside a regex character class (``[...]``), not a complete pattern.
# NOTE(review): `filter_by_lang_regex` repeats this exact literal as its
# default `regex` argument instead of referencing this constant — keep the two
# in sync if either is edited.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"


def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
    """Return True when enough of ``text`` consists of characters in ``regex``.

    Spaces are ignored on both sides of the comparison: the number of
    non-space characters that belong to the ``regex`` character set is divided
    by the number of non-space characters in ``text``.

    Args:
        text: The text to inspect (``str``, or bytes accepted by
            ``six.ensure_str``).
        ratio: Threshold the allowed-character fraction must strictly exceed.
        regex: Body of a regex character class listing the allowed
            characters; it is interpolated into ``[^...]+``.

    Returns:
        ``True`` if the fraction of allowed characters is greater than
        ``ratio``; ``False`` otherwise, including when ``text`` is empty or
        contains only spaces (there is nothing to measure).
    """
    # Empty input would otherwise end in a division by zero below.
    if not text:
        return False

    # Decode once and use the same string for numerator and denominator, so
    # bytes input does not fail on ``.replace(" ", "")``.
    text = six.ensure_str(text)
    total_chars = len(text.replace(" ", ""))
    if total_chars == 0:
        # Space-only text: no characters to compute a ratio over.
        return False

    # Every run of disallowed characters collapses to a space, which is then
    # dropped, leaving only the allowed characters.
    allowed_only = re.sub(r"[^" + regex + "]+", " ", text).replace(" ", "")

    return (len(allowed_only) / total_chars) > ratio


def filter_by_num_tokens(text, gt=64):
    """Tell whether ``text`` yields more than ``gt`` tokens.

    Tokens are whatever ``word_tokenize`` returns for ``text``; the
    comparison is strict, so exactly ``gt`` tokens gives ``False``.
    """
    tokens = word_tokenize(text)
    token_count = len(tokens)
    return token_count > gt


def filter_by_num_sents(text, gt=2):
    """Tell whether ``text`` yields more than ``gt`` sentences.

    Sentences are whatever ``sent_tokenize`` returns for ``text``; the
    comparison is strict, so exactly ``gt`` sentences gives ``False``.
    """
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)
    return sentence_count > gt


def filter_by_adv(text, ratio=50):
    """Return True when the text's separator score stays below ``ratio``.

    The score is the sum of:
      * the number of pieces obtained by splitting on ``,``,
      * the number of ``word:word`` pairs found by the regex,
      * the number of pieces obtained by splitting on ``،``.

    Each split contributes at least one piece, so the score is never below 2.
    Despite its name, ``ratio`` is compared against this count directly.
    """
    # Splitting on a separator yields (occurrences + 1) pieces.
    comma_pieces = text.count(",") + 1
    virgool_pieces = text.count("،") + 1
    colon_pairs = len(re.findall(r"""(?:([^\W]+):([^\W]+))""", text))

    score = comma_pieces + colon_pairs + virgool_pieces
    return score < ratio


def normalizer(text, do_lowercase=False):
    """Run ``normalize`` on ``text`` and optionally lowercase the result.

    Args:
        text: Input passed straight to ``normalize``.
        do_lowercase: When truthy, ``.lower()`` is applied to the normalized
            text before it is returned.

    Returns:
        The normalized (and possibly lowercased) text.
    """
    normalized = normalize(text)
    return normalized.lower() if do_lowercase else normalized