teven committed on
Commit
c8f45af
1 Parent(s): 96e0b3b

better description, flagged words

Files changed (2)
  1. app.py +20 -20
  2. en_examples_with_stats_ldnoob.json +2 -2
app.py CHANGED
@@ -30,51 +30,51 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
 
     if "special_%" in columns:
         special_ratio = st.sidebar.slider(
-            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0
+            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=0.1
         )
         cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
         special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"Kept text with <{special_cutoff:.1f}% special chars")
+        st.sidebar.text(f"No docs with <{special_cutoff:.1f}% special chars")
         keys.append(("special_%", special_cutoff, True))
 
     if "stop_%" in columns:
         stop_ratio = st.sidebar.slider(
-            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0
+            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=0.1
         )
         cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
         stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
+        st.sidebar.text(f"No docs with >{stop_cutoff:.2f}% stop words")
         keys.append(("stop_%", stop_cutoff, False))
 
     @st.cache(suppress_st_warning=True)
-    def recalculate_bad_words(file):
+    def recalculate_flagged_words(file):
 
-        def bad_word_ratio(text: str, bad_word_list):
-            return len([word for word in text.split() if word.lower().strip() in bad_word_list]) / len(text.split())
+        def flagged_word_ratio(text: str, flagged_word_list):
+            return len([word for word in text.split() if word.lower().strip() in flagged_word_list]) / len(text.split())
 
-        bad_word_list = [word.decode().strip() for word in file.readlines()]
+        flagged_word_list = [word.decode().strip() for word in file.readlines()]
 
-        bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
-        data["bad_%"] = bad_word_ratios
+        flagged_word_ratios = [flagged_word_ratio(text, flagged_word_list) * 100 for text in data["text"]]
+        data["flagged_%"] = flagged_word_ratios
 
-    bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
+    flagged_word_file = st.sidebar.file_uploader("Upload your own list of flagged words (1 word per line)")
 
-    if "bad_%" in columns:
-        bad_ratio = st.sidebar.slider(
-            "% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1
+    if "flagged_%" in columns:
+        flagged_ratio = st.sidebar.slider(
+            "% filtered by flaggedwords ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
-        bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
-        st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
-        keys.append(("bad_%", bad_cutoff, True))
+        flagged_index = max(0, math.floor((100 - flagged_ratio) * len(data.index) / 100) - 1)
+        flagged_cutoff = np.partition(data["flagged_%"], flagged_index)[flagged_index]
+        st.sidebar.text(f"No docs with >{flagged_cutoff:.2f}% flagged words")
+        keys.append(("flagged_%", flagged_cutoff, True))
 
     if "perplexity" in columns:
         ppl_ratio = st.sidebar.slider(
-            "% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0
+            "% filtered by perplexity", 0.0, 50.0, 0.0, step=0.1
        )
         ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
         ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
-        st.sidebar.text(f"Kept text with <{ppl_cutoff:.0f} perplexity")
+        st.sidebar.text(f"No docs with >{ppl_cutoff:.0f} perplexity")
         keys.append(("perplexity", ppl_cutoff, True))
 
     cond = [
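
Each filter branch in the diff follows the same percentile pattern: the slider picks what fraction of documents to discard, and np.partition finds the corresponding threshold value without fully sorting the column. A minimal standalone sketch of that logic, where sample_pcts and ratio are illustrative stand-ins rather than names from the app:

import math
import numpy as np

# Hypothetical per-document percentages; the app reads these from
# columns such as data["flagged_%"] instead.
sample_pcts = np.array([0.0, 1.2, 3.4, 0.5, 12.0, 7.7, 2.1, 0.0, 5.5, 9.9])
ratio = 20.0  # slider value: discard the worst 20% of documents

# Keeping (100 - ratio)% of n documents puts the cutoff at sorted position
# floor((100 - ratio) * n / 100) - 1, clamped to 0.
cutoff_index = max(0, math.floor((100 - ratio) * len(sample_pcts) / 100) - 1)

# np.partition places the cutoff_index-th smallest value at that index in
# O(n), cheaper than a full sort when only one order statistic is needed.
cutoff = np.partition(sample_pcts, cutoff_index)[cutoff_index]

kept = sample_pcts[sample_pcts <= cutoff]
print(f"cutoff = {cutoff:.2f}%, keeping {len(kept)}/{len(sample_pcts)} docs")  # 7.70%, 8/10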
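
The renamed recalculation helper works on the raw upload: st.sidebar.file_uploader returns a binary buffer, so each line is decoded before membership testing against lowercased, whitespace-split tokens. A self-contained sketch of the same ratio computation, with io.BytesIO standing in for the uploaded file and made-up sample words:

import io

def flagged_word_ratio(text: str, flagged_word_list):
    # Share of whitespace-split tokens whose lowercased form is in the list.
    # Mirrors the app's helper, including its assumption of non-empty text.
    words = text.split()
    return len([w for w in words if w.lower().strip() in flagged_word_list]) / len(words)

# io.BytesIO mimics the binary buffer returned by st.sidebar.file_uploader.
uploaded = io.BytesIO(b"foo\nbar\n")
flagged_word_list = [line.decode().strip() for line in uploaded.readlines()]

print(flagged_word_ratio("Foo went to the bar", flagged_word_list) * 100)  # 40.0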
en_examples_with_stats_ldnoob.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f82e7ebdbb2054c2f6dbfab70ce589e0972fa61fd7e08b778e848e07537c4e1
-size 21187243
+oid sha256:9e4e2a111df4e1a3243d53c9516baf8a3f495f8faec5b86fe8787bc6dc2a03bc
+size 21206447