HugoLaurencon committed · commit 6f25c5c · 1 parent: d463071

new tool to analyse our own doc
Browse files:
- .gitignore  +2 -0
- app.py  +132 -4
- parameters_filtering.py  +2 -2
.gitignore ADDED

@@ -0,0 +1,2 @@
+*cpython-39.pyc
+.DS_Store
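The two patterns keep CPython 3.9 bytecode caches (e.g. app.cpython-39.pyc) and macOS Finder metadata out of the repository.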
app.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
 
 import matplotlib.pyplot as plt
 
-from filtering import Filtering
+from filtering import LoadParameters, ModifyingDocuments, Filtering
 
 
 class Visualization:
@@ -25,6 +25,10 @@ class Visualization:
         num_docs,
         num_docs_for_words,
         max_len_text_display,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -33,6 +37,23 @@ class Visualization:
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display
 
+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+
     def preamble(self):
         st.markdown(
             "Before diving into this demo, you might want to take a look at what the filtering pipeline looks like in more detail."
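For orientation: lid.176.bin (configured further down) is fastText's published language-identification model, so LoadParameters.load_model_lang_id presumably wraps the standard fasttext API. A minimal sketch of that assumption (the wrapper itself is not shown in this diff):

import fasttext

model = fasttext.load_model("./lid.176.bin")  # path_fasttext_model below
# fastText returns labels of the form "__label__en" plus a confidence score.
labels, scores = model.predict("This is an English sentence.", k=1)
lang_pred = labels[0].replace("__label__", "")
print(lang_pred, round(float(scores[0]), 3))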
@@ -159,6 +180,7 @@ class Visualization:
             "repetitions_ratio",
             cutoff_repetitions_ratio,
             True,
+            repetitions_length,
         )
         keys.append(new_key)
         cond = get_cond(new_key[0], new_key[1], new_key[2])
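The new fourth element, repetitions_length, is the n-gram size that analyse_personal_doc below reads back as int(key[3]). Filtering.compute_repetitions_ratio itself is not part of this diff; as a rough illustration only (an assumption, not the repo's code), such a ratio could count duplicated character n-grams:

from collections import Counter

def repetitions_ratio(text: str, repetitions_length: int) -> float:
    # Fraction of character n-grams occurring more than once; illustrative only.
    if len(text) < repetitions_length:
        return 0.0
    grams = [text[i : i + repetitions_length] for i in range(len(text) - repetitions_length + 1)]
    counts = Counter(grams)
    return sum(c for c in counts.values() if c > 1) / len(grams)

print(repetitions_ratio("ababab", 2))  # all 5 bigrams repeat -> 1.0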
@@ -392,8 +414,104 @@ class Visualization:
         ax.set_ylabel("frequency in the documents")
         st.pyplot(fig)
 
-    def
-
+    def analyse_personal_doc(self):
+        st.header("Analyse your own document")
+
+        personal_doc = st.text_area(
+            label="Paste here the document you want to analyse",
+            value="",
+            max_chars=10000,
+        )
+
+        is_discarded = False
+
+        def is_doc_discarded(key, score):
+            if key[2]:  # max cutoff
+                return score > key[1]
+            else:
+                return score < key[1]
+
+        for key in self.keys:
+            if key[0] == "number_words":
+                words = ModifyingDocuments.get_words_from_document(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    lower_case=False,
+                    strip_characters=self.param["strip_characters"],
+                )
+                if key[2]:
+                    st.markdown(f"Number of words: {len(words)}")
+                if is_doc_discarded(key, len(words)):
+                    is_discarded = True
+
+            elif key[0] == "repetitions_ratio":
+                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
+                repetitions_ratio = round(repetitions_ratio, 3)
+                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                if is_doc_discarded(key, repetitions_ratio):
+                    is_discarded = True
+
+            elif key[0] == "special_characters_ratio":
+                special_characters_ratio = Filtering.compute_special_characters_ratio(
+                    personal_doc, self.param["special_characters"]
+                )
+                special_characters_ratio = round(special_characters_ratio, 3)
+                st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                if is_doc_discarded(key, special_characters_ratio):
+                    is_discarded = True
+
+            elif key[0] == "stopwords_ratio":
+                stopwords_ratio = Filtering.compute_stopwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.stopwords,
+                )
+                stopwords_ratio = round(stopwords_ratio, 3)
+                st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                if is_doc_discarded(key, stopwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "badwords_ratio":
+                badwords_ratio = Filtering.compute_badwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.badwords,
+                )
+                badwords_ratio = round(badwords_ratio, 3)
+                st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                if is_doc_discarded(key, badwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "lang_id_score":
+                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
+                    personal_doc, self.model_lang_id
+                )
+                lang_id_score = round(lang_id_score, 3)
+                st.markdown(f"Language identification confidence score: {lang_id_score}")
+                # compare against the language-ID score, not the flagged-words ratio
+                if is_doc_discarded(key, lang_id_score) or (self.lang_dataset_id != lang_pred_dataset_id):
+                    is_discarded = True
+
+            elif key[0] == "perplexity_score":
+                perplexity_score = Filtering.compute_perplexity_score(
+                    personal_doc,
+                    self.sentencepiece_model,
+                    self.kenlm_model,
+                )
+                perplexity_score = round(perplexity_score, 3)
+                st.markdown(f"Perplexity score: {perplexity_score}")
+                if is_doc_discarded(key, perplexity_score):
+                    is_discarded = True
+
+        is_discarded = "" if is_discarded else "not "
+        st.markdown(f"With the current filtering parameters, this document is {is_discarded}discarded.")
 
     def download_data(self):
         st.header("Download data")
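Each cutoff in self.keys is a tuple of the form (metric_name, cutoff_value, is_max_cutoff, *extras), as the repetitions_ratio hunk above illustrates. A small worked example of the is_doc_discarded convention (cutoff values here are made up):

def is_doc_discarded(key, score):
    if key[2]:  # max cutoff: discard when the score exceeds it
        return score > key[1]
    else:       # min cutoff: discard when the score falls below it
        return score < key[1]

print(is_doc_discarded(("repetitions_ratio", 0.3, True, 10), 0.42))  # True
print(is_doc_discarded(("stopwords_ratio", 0.1, False), 0.05))       # True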
@@ -413,7 +531,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
-        self.
+        self.analyse_personal_doc()
         self.download_data()
 
 
@@ -424,6 +542,12 @@ num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 
+# Only useful for analyse_personal_doc
+lang_dataset_id = "en"
+path_fasttext_model = "./lid.176.bin"
+path_sentencepiece_model = "./en.sp.model"
+path_kenlm_model = "./en.arpa.bin"
+
 visualization = Visualization(
     path_instructions,
     path_data,
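By their names, en.sp.model is a SentencePiece tokenizer and en.arpa.bin a binarized KenLM n-gram model; the diff only passes the paths through, so here is a minimal loading sketch under that assumption:

import sentencepiece as spm
import kenlm

sp = spm.SentencePieceProcessor(model_file="./en.sp.model")
lm = kenlm.Model("./en.arpa.bin")

tokens = sp.encode("Hello world", out_type=str)
# KenLM scores space-separated tokens; perplexity() is a convenience wrapper.
print(tokens, lm.perplexity(" ".join(tokens)))

Since app.py is a Streamlit script, the new section can be tried locally with streamlit run app.py once these four model files sit next to the script.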
@@ -431,5 +555,9 @@ visualization = Visualization(
     num_docs,
     num_docs_for_words,
     max_len_text_display,
+    lang_dataset_id,
+    path_fasttext_model,
+    path_sentencepiece_model,
+    path_kenlm_model,
 )
 visualization.visualization()
parameters_filtering.py CHANGED

@@ -7,8 +7,8 @@ other_special_characters = (
     " ’“”–ー一▬…✦�£•€«»°·═"
     "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
     "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
-    "
-    "
+    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+    "」﴾》"
 )
 emoji = list(emoji.UNICODE_EMOJI["en"].keys())
 
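These two strings extend other_special_characters, the set that app.py passes to Filtering.compute_special_characters_ratio via self.param["special_characters"]. One plausible reading of that ratio, as an illustration only (the actual implementation is not in this diff):

def special_characters_ratio(text: str, special_characters) -> float:
    # Share of characters belonging to the special set; illustrative only.
    if not text:
        return 0.0
    return sum(char in special_characters for char in text) / len(text)

print(special_characters_ratio("abc…✦", set("…✦")))  # 2 of 5 chars -> 0.4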