HugoLaurencon committed · commit 6f25c5c · 1 parent: d463071

new tool to analyse our own doc
Browse files:
- .gitignore  +2 -0
- app.py  +132 -4
- parameters_filtering.py  +2 -2
.gitignore ADDED

@@ -0,0 +1,2 @@
+*cpython-39.pyc
+.DS_Store
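The two patterns keep CPython 3.9 bytecode caches (e.g. app.cpython-39.pyc) and macOS Finder metadata out of the repository.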
app.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
 
 import matplotlib.pyplot as plt
 
-from filtering import Filtering
+from filtering import LoadParameters, ModifyingDocuments, Filtering
 
 
 class Visualization:
@@ -25,6 +25,10 @@ class Visualization:
         num_docs,
         num_docs_for_words,
         max_len_text_display,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -33,6 +37,23 @@ class Visualization:
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display
 
+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+
     def preamble(self):
         st.markdown(
             "Before diving into this demo, you might want to take a look at what the filtering pipeline looks like in more detail."
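For orientation: lid.176.bin (configured further down) is fastText's published language-identification model, so LoadParameters.load_model_lang_id presumably wraps the standard fasttext API. A minimal sketch of that assumption (the wrapper itself is not shown in this diff):

import fasttext

model = fasttext.load_model("./lid.176.bin")  # path_fasttext_model below
# fastText returns labels of the form "__label__en" plus a confidence score.
labels, scores = model.predict("This is an English sentence.", k=1)
lang_pred = labels[0].replace("__label__", "")
print(lang_pred, round(float(scores[0]), 3))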
@@ -159,6 +180,7 @@ class Visualization:
             "repetitions_ratio",
             cutoff_repetitions_ratio,
             True,
+            repetitions_length,
         )
         keys.append(new_key)
         cond = get_cond(new_key[0], new_key[1], new_key[2])
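The new fourth element, repetitions_length, is the n-gram size that analyse_personal_doc below reads back as int(key[3]). Filtering.compute_repetitions_ratio itself is not part of this diff; as a rough illustration only (an assumption, not the repo's code), such a ratio could count duplicated character n-grams:

from collections import Counter

def repetitions_ratio(text: str, repetitions_length: int) -> float:
    # Fraction of character n-grams occurring more than once; illustrative only.
    if len(text) < repetitions_length:
        return 0.0
    grams = [text[i : i + repetitions_length] for i in range(len(text) - repetitions_length + 1)]
    counts = Counter(grams)
    return sum(c for c in counts.values() if c > 1) / len(grams)

print(repetitions_ratio("ababab", 2))  # all 5 bigrams repeat -> 1.0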
@@ -392,8 +414,104 @@ class Visualization:
         ax.set_ylabel("frequency in the documents")
         st.pyplot(fig)
 
-    def
-
+    def analyse_personal_doc(self):
+        st.header("Analyse your own document")
+
+        personal_doc = st.text_area(
+            label="Paste here the document you want to analyse",
+            value="",
+            max_chars=10000,
+        )
+
+        is_discarded = False
+
+        def is_doc_discarded(key, score):
+            if key[2]:  # max cutoff
+                return score > key[1]
+            else:
+                return score < key[1]
+
+        for key in self.keys:
+            if key[0] == "number_words":
+                words = ModifyingDocuments.get_words_from_document(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    lower_case=False,
+                    strip_characters=self.param["strip_characters"],
+                )
+                if key[2]:
+                    st.markdown(f"Number of words: {len(words)}")
+                if is_doc_discarded(key, len(words)):
+                    is_discarded = True
+
+            elif key[0] == "repetitions_ratio":
+                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
+                repetitions_ratio = round(repetitions_ratio, 3)
+                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                if is_doc_discarded(key, repetitions_ratio):
+                    is_discarded = True
+
+            elif key[0] == "special_characters_ratio":
+                special_characters_ratio = Filtering.compute_special_characters_ratio(
+                    personal_doc, self.param["special_characters"]
+                )
+                special_characters_ratio = round(special_characters_ratio, 3)
+                st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                if is_doc_discarded(key, special_characters_ratio):
+                    is_discarded = True
+
+            elif key[0] == "stopwords_ratio":
+                stopwords_ratio = Filtering.compute_stopwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.stopwords,
+                )
+                stopwords_ratio = round(stopwords_ratio, 3)
+                st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                if is_doc_discarded(key, stopwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "badwords_ratio":
+                badwords_ratio = Filtering.compute_badwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.badwords,
+                )
+                badwords_ratio = round(badwords_ratio, 3)
+                st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                if is_doc_discarded(key, badwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "lang_id_score":
+                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
+                    personal_doc, self.model_lang_id
+                )
+                lang_id_score = round(lang_id_score, 3)
+                st.markdown(f"Language identification confidence score: {lang_id_score}")
+                # compare against the language-ID score, not the flagged-words ratio
+                if is_doc_discarded(key, lang_id_score) or (self.lang_dataset_id != lang_pred_dataset_id):
+                    is_discarded = True
+
+            elif key[0] == "perplexity_score":
+                perplexity_score = Filtering.compute_perplexity_score(
+                    personal_doc,
+                    self.sentencepiece_model,
+                    self.kenlm_model,
+                )
+                perplexity_score = round(perplexity_score, 3)
+                st.markdown(f"Perplexity score: {perplexity_score}")
+                if is_doc_discarded(key, perplexity_score):
+                    is_discarded = True
+
+        is_discarded = "" if is_discarded else "not "
+        st.markdown(f"With the current filtering parameters, this document is {is_discarded}discarded.")
 
     def download_data(self):
         st.header("Download data")
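Each cutoff in self.keys is a tuple of the form (metric_name, cutoff_value, is_max_cutoff, *extras), as the repetitions_ratio hunk above illustrates. A small worked example of the is_doc_discarded convention (cutoff values here are made up):

def is_doc_discarded(key, score):
    if key[2]:  # max cutoff: discard when the score exceeds it
        return score > key[1]
    else:       # min cutoff: discard when the score falls below it
        return score < key[1]

print(is_doc_discarded(("repetitions_ratio", 0.3, True, 10), 0.42))  # True
print(is_doc_discarded(("stopwords_ratio", 0.1, False), 0.05))       # True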
@@ -413,7 +531,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
-        self.
+        self.analyse_personal_doc()
         self.download_data()
 
 
@@ -424,6 +542,12 @@ num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 
+# Only useful for analyse_personal_doc
+lang_dataset_id = "en"
+path_fasttext_model = "./lid.176.bin"
+path_sentencepiece_model = "./en.sp.model"
+path_kenlm_model = "./en.arpa.bin"
+
 visualization = Visualization(
     path_instructions,
     path_data,
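By their names, en.sp.model is a SentencePiece tokenizer and en.arpa.bin a binarized KenLM n-gram model; the diff only passes the paths through, so here is a minimal loading sketch under that assumption:

import sentencepiece as spm
import kenlm

sp = spm.SentencePieceProcessor(model_file="./en.sp.model")
lm = kenlm.Model("./en.arpa.bin")

tokens = sp.encode("Hello world", out_type=str)
# KenLM scores space-separated tokens; perplexity() is a convenience wrapper.
print(tokens, lm.perplexity(" ".join(tokens)))

Since app.py is a Streamlit script, the new section can be tried locally with streamlit run app.py once these four model files sit next to the script.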
@@ -431,5 +555,9 @@ visualization = Visualization(
     num_docs,
     num_docs_for_words,
     max_len_text_display,
+    lang_dataset_id,
+    path_fasttext_model,
+    path_sentencepiece_model,
+    path_kenlm_model,
 )
 visualization.visualization()
parameters_filtering.py CHANGED

@@ -7,8 +7,8 @@ other_special_characters = (
     " ’“”–ー一▬…✦�£•€«»°·═"
     "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
     "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
-    "
-    "
+    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+    "」﴾》"
 )
 emoji = list(emoji.UNICODE_EMOJI["en"].keys())
 
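These two strings extend other_special_characters, the set that app.py passes to Filtering.compute_special_characters_ratio via self.param["special_characters"]. One plausible reading of that ratio, as an illustration only (the actual implementation is not in this diff):

def special_characters_ratio(text: str, special_characters) -> float:
    # Share of characters belonging to the special set; illustrative only.
    if not text:
        return 0.0
    return sum(char in special_characters for char in text) / len(text)

print(special_characters_ratio("abc…✦", set("…✦")))  # 2 of 5 chars -> 0.4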