Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

HugoLaurencon commited on Nov 25, 2021

Commit

ffdfff7

1 Parent(s): af427be

Upload app.py

Browse files

Files changed (1) hide show

app.py +147 -64

app.py CHANGED Viewed

@@ -2,70 +2,153 @@ import streamlit as st
 import json
 import pandas as pd
 import numpy as np
-st.title('5k English documents from Oscar with their stats.')
-path_data = "./10K_english_examples_with_stats.json"
-with open(path_data) as json_file:
-    data = json.load(json_file)
-data = data[:5000]
-data = pd.DataFrame(data)
-del data["len_words"]
-st.header('Parameters of the filtering')
-cutoff_special_characters_ratio = st.slider("Max cutoff special characters ratio", 0., 1., 1., step=0.01)
-cutoff_stopwords_ratio = st.slider("Min cutoff stopwords ratio", 0., 1., 0., step=0.01)
-cutoff_badwords_ratio = st.slider("Max cutoff badwords ratio", 0., 1., 1., step=0.001)
-cutoff_lang_id_score = st.slider("Min cutoff lang id score", 0., 1., 0., step=0.01)
-cutoff_perplexity_score = st.slider("Perplexity cutoff perplexity score", 0, 14000000, 14000000)
-keys = [
-    ("special_characters_ratio", cutoff_special_characters_ratio, True),
-    ("stopwords_ratio", cutoff_stopwords_ratio, False),
-    ("badwords_ratio", cutoff_badwords_ratio, True),
-    ("lang_id_score", cutoff_lang_id_score, False),
-    ("perplexity_score", cutoff_perplexity_score, True),
-]
-cond = [(data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff) for key, cutoff, max_cutoff in keys]
-cond = np.all(cond, axis=0)
-data_keep = data.loc[cond]
-st.header('Data that we keep')
-st.markdown("Click on a column to sort by it.")
-st.markdown("Place the cursor on the text to display it.")
-st.dataframe(data_keep)
-data_not_keep = data.loc[np.invert(cond)]
-st.header('Data that is thrown away')
-st.markdown("Click on a column to sort by it.")
-st.markdown("Place the cursor on the text to display it.")
-st.dataframe(data_not_keep)
-def plot_hist(key, num_bins=50):
-    st.header(" ".join(key.split("_")))
-    hist_values = data[key].values
-    max_range = np.max(hist_values)
-    hist_values = np.histogram(
-        hist_values,
-        bins=num_bins,
-        range=(0,max_range)
-    )[0]
-    st.bar_chart(hist_values)
-    st.markdown(f"Each bin is of size: {max_range/num_bins}.")
-for key, _, _ in keys:
-    plot_hist(key)
-st.header('Download data')
-with open(path_data) as json_file:
-    btn = st.download_button(
-        label="Download data as json",
-        data=json_file,
-        file_name='data.json',
-    )

 import json
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
+def visualization(path_data, lang, num_docs, num_docs_for_words):
+    with open(path_data) as json_file:
+        data = json.load(json_file)
+    num_docs = min(num_docs, len(data))
+    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")
+    sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
+    words = [word for sentence in sentences for word in sentence]
+    words_data = [{"len_word": len(word), "word": word} for word in words]
+    words_data = pd.DataFrame(words_data)
+    data = data[:num_docs]
+    data = pd.DataFrame(data)
+    columns = list(data)
+    keys = []
+    st.header("Parameters of the filtering")
+    if "special_characters_ratio" in columns:
+        cutoff_special_characters_ratio = st.slider(
+            "Max cutoff special characters ratio", 0.0, 1.0, 1.0, step=0.01
+        )
+        keys.append(("special_characters_ratio", cutoff_special_characters_ratio, True))
+    if "stopwords_ratio" in columns:
+        cutoff_stopwords_ratio = st.slider(
+            "Min cutoff stopwords ratio", 0.0, 1.0, 0.0, step=0.01
+        )
+        keys.append(("stopwords_ratio", cutoff_stopwords_ratio, False))
+    if "badwords_ratio" in columns:
+        cutoff_badwords_ratio = st.slider(
+            "Max cutoff badwords ratio", 0.0, 1.0, 1.0, step=0.001
+        )
+        keys.append(("badwords_ratio", cutoff_badwords_ratio, True))
+    if "lang_id_score" in columns:
+        cutoff_lang_id_score = st.slider(
+            "Min cutoff lang id score", 0.0, 1.0, 0.0, step=0.01
+        )
+        keys.append(("lang_id_score", cutoff_lang_id_score, False))
+    if "perplexity_score" in columns:
+        max_pp = int(np.max(data["perplexity_score"])) + 1
+        cutoff_perplexity_score = st.slider(
+            "Perplexity cutoff perplexity score", 0, max_pp, max_pp
+        )
+        keys.append(("perplexity_score", cutoff_perplexity_score, True))
+    cond = [
+        (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
+        for key, cutoff, max_cutoff in keys
+    ]
+    cond = np.all(cond, axis=0)
+    data_keep = data.loc[cond]
+    st.header("Data that we keep")
+    st.markdown("Click on a column to sort by it.")
+    st.markdown("Place the cursor on the text to display it.")
+    st.dataframe(data_keep)
+    data_not_keep = data.loc[np.invert(cond)]
+    st.header("Data that is thrown away")
+    st.markdown("Click on a column to sort by it.")
+    st.markdown("Place the cursor on the text to display it.")
+    st.dataframe(data_not_keep)
+    def plot_hist(dataframe, key, num_bins=50):
+        st.header(" ".join(key.split("_")))
+        hist_values = dataframe[key].values
+        max_range = np.max(hist_values)
+        hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
+        st.bar_chart(hist_values)
+        st.markdown(f"Each bin is of size: {max_range/num_bins}.")
+    for key, _, _ in keys:
+        plot_hist(data, key)
+    st.header("Zipf's Law")
+    def get_frequency_words(data):
+        freq_words = {}
+        for index, row in data.iterrows():
+            for word in row["text"].split(" "):
+                if word in freq_words:
+                    freq_words[word] += 1
+                else:
+                    freq_words[word] = 1
+        freq_words = np.array(list(freq_words.values()))
+        freq_words = -np.sort(-freq_words)
+        return freq_words
+    freq_words_data = get_frequency_words(data)
+    freq_words_data_keep = get_frequency_words(data_keep)
+    freq_words_data_not_keep = get_frequency_words(data_not_keep)
+    fig, ax = plt.subplots()
+    ax.loglog(freq_words_data)
+    ax.loglog(freq_words_data_keep)
+    ax.loglog(freq_words_data_not_keep)
+    ax.set_title("Zipf's Law")
+    ax.set_xlabel("$i$-th most frequent word")
+    ax.set_ylabel("frequency in the documents")
+    ax.legend(["All data", "Data that we keep", "Data that is thrown away"])
+    st.pyplot(fig)
+    st.markdown("If less than three curves are displayed, it means that there are overlaps.")
+    st.header("Parameter of the filtering for words")
+    max_len_word = int(np.max(words_data["len_word"])) + 1
+    cutoff_word = st.slider("Max cutoff length word", 0, max_len_word, max_len_word)
+    cond_words = words_data["len_word"] <= cutoff_word
+    words_keep = words_data.loc[cond_words]
+    st.header(f"Words that we keep (for {num_docs_for_words} documents)")
+    st.markdown("Click on a column to sort by it.")
+    st.markdown("Place the cursor on the text to display it.")
+    st.dataframe(words_keep)
+    words_not_keep = words_data.loc[np.invert(cond_words)]
+    st.header(f"Words that are thrown away (for {num_docs_for_words} documents)")
+    st.markdown("Click on a column to sort by it.")
+    st.markdown("Place the cursor on the text to display it.")
+    st.dataframe(words_not_keep)
+    plot_hist(words_data, "len_word")
+    st.header("Download data")
+    with open(path_data) as json_file:
+        btn = st.download_button(
+            label="Download data as json",
+            data=json_file,
+            file_name="data.json",
+        )
+path_data = "./en_examples_with_stats.json"
+lang = "English"
+num_docs = 5000
+num_docs_for_words = 500
+visualization(path_data, lang, num_docs, num_docs_for_words)