Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Community

teven committed on Nov 26, 2021

Commit

64ce142

•

1 Parent(s): 1fed88b

better sliders

Browse files

Files changed (1) hide show

app.py +28 -13

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ import matplotlib.pyplot as plt
 def visualization(path_data, lang, num_docs, num_docs_for_words):
     with open(path_data) as json_file:
         data = json.load(json_file)
@@ -29,10 +28,9 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     st.header("Filtering based on document content")
     if "special_%" in columns:
         special_ratio = st.sidebar.slider(
-            "% filtered by special characters ratio", 0.0, 100.0, 0.0, step=1.0
         )
         cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
         special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
@@ -41,16 +39,33 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     if "stop_%" in columns:
         stop_ratio = st.sidebar.slider(
-            "% filtered by stop word ratio", 0.0, 100.0, 0.0, step=1.0
         )
         cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
         stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
         st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
         keys.append(("stop_%", stop_cutoff, False))
     if "bad_%" in columns:
         bad_ratio = st.sidebar.slider(
-            "% filtered by badwords ratio", 0.0, 100.0, 0.0, step=1.0
         )
         bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
         bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
@@ -59,7 +74,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     if "perplexity" in columns:
         ppl_ratio = st.sidebar.slider(
-            "% filtered by perplexity", 0.0, 100.0, 0.0, step=1.0
         )
         ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
         ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
@@ -82,13 +97,13 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
     st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
     st.dataframe(data_keep)
-    def plot_hist(dataframe, key, num_bins=50):
-        st.subheader(" ".join(key.split("_")))
-        hist_values = dataframe[key].values
-        max_range = np.max(hist_values)
-        hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
-        st.bar_chart(hist_values)
-        st.markdown(f"Each bin is of size: {max_range/num_bins}.")
     # for key, _, _ in keys:
     #     plot_hist(data, key)

 def visualization(path_data, lang, num_docs, num_docs_for_words):
     with open(path_data) as json_file:
         data = json.load(json_file)
     st.header("Filtering based on document content")
     if "special_%" in columns:
         special_ratio = st.sidebar.slider(
+            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0
         )
         cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
         special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
     if "stop_%" in columns:
         stop_ratio = st.sidebar.slider(
+            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0
         )
         cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
         stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
         st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
         keys.append(("stop_%", stop_cutoff, False))
+    # def recalculate_bad_words(file):
+    #
+    #     def bad_word_ratio(text: str, bad_word_list):
+    #         return sum(
+    #             [text.count(bad_word.decode()) * len(bad_word.decode().split()) for bad_word in bad_word_list]) / len(
+    #             text.split())
+    #
+    #     bad_word_list = file.readlines()
+    #
+    #     bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
+    #     data["bad_%"] = bad_word_ratios
+    #
+    # bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
+    #
+    # if bad_word_file is not None:
+    #     recalculate_bad_words(bad_word_file)
     if "bad_%" in columns:
         bad_ratio = st.sidebar.slider(
+            "% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1
         )
         bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
         bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
     if "perplexity" in columns:
         ppl_ratio = st.sidebar.slider(
+            "% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0
         )
         ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
         ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
     st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
     st.dataframe(data_keep)
+    # def plot_hist(dataframe, key, num_bins=50):
+    #     st.subheader(" ".join(key.split("_")))
+    #     hist_values = dataframe[key].values
+    #     max_range = np.max(hist_values)
+    #     hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
+    #     st.bar_chart(hist_values)
+    #     st.markdown(f"Each bin is of size: {max_range/num_bins}.")
     # for key, _, _ in keys:
     #     plot_hist(data, key)