HugoLaurencon committed on
Commit
ffdfff7
·
1 Parent(s): af427be

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -64
app.py CHANGED
@@ -2,70 +2,153 @@ import streamlit as st
2
  import json
3
  import pandas as pd
4
  import numpy as np
 
5
 
6
 
7
- st.title('5k English documents from Oscar with their stats.')
8
-
9
- path_data = "./10K_english_examples_with_stats.json"
10
- with open(path_data) as json_file:
11
- data = json.load(json_file)
12
-
13
- data = data[:5000]
14
- data = pd.DataFrame(data)
15
- del data["len_words"]
16
-
17
- st.header('Parameters of the filtering')
18
-
19
- cutoff_special_characters_ratio = st.slider("Max cutoff special characters ratio", 0., 1., 1., step=0.01)
20
- cutoff_stopwords_ratio = st.slider("Min cutoff stopwords ratio", 0., 1., 0., step=0.01)
21
- cutoff_badwords_ratio = st.slider("Max cutoff badwords ratio", 0., 1., 1., step=0.001)
22
- cutoff_lang_id_score = st.slider("Min cutoff lang id score", 0., 1., 0., step=0.01)
23
- cutoff_perplexity_score = st.slider("Perplexity cutoff perplexity score", 0, 14000000, 14000000)
24
-
25
- keys = [
26
- ("special_characters_ratio", cutoff_special_characters_ratio, True),
27
- ("stopwords_ratio", cutoff_stopwords_ratio, False),
28
- ("badwords_ratio", cutoff_badwords_ratio, True),
29
- ("lang_id_score", cutoff_lang_id_score, False),
30
- ("perplexity_score", cutoff_perplexity_score, True),
31
- ]
32
-
33
- cond = [(data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff) for key, cutoff, max_cutoff in keys]
34
- cond = np.all(cond, axis=0)
35
-
36
- data_keep = data.loc[cond]
37
- st.header('Data that we keep')
38
- st.markdown("Click on a column to sort by it.")
39
- st.markdown("Place the cursor on the text to display it.")
40
- st.dataframe(data_keep)
41
-
42
- data_not_keep = data.loc[np.invert(cond)]
43
- st.header('Data that is thrown away')
44
- st.markdown("Click on a column to sort by it.")
45
- st.markdown("Place the cursor on the text to display it.")
46
- st.dataframe(data_not_keep)
47
-
48
- def plot_hist(key, num_bins=50):
49
- st.header(" ".join(key.split("_")))
50
- hist_values = data[key].values
51
- max_range = np.max(hist_values)
52
- hist_values = np.histogram(
53
- hist_values,
54
- bins=num_bins,
55
- range=(0,max_range)
56
- )[0]
57
- st.bar_chart(hist_values)
58
- st.markdown(f"Each bin is of size: {max_range/num_bins}.")
59
-
60
- for key, _, _ in keys:
61
- plot_hist(key)
62
-
63
- st.header('Download data')
64
-
65
- with open(path_data) as json_file:
66
- btn = st.download_button(
67
- label="Download data as json",
68
- data=json_file,
69
- file_name='data.json',
70
- )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import json
3
  import pandas as pd
4
  import numpy as np
5
+ import matplotlib.pyplot as plt
6
 
7
 
8
def visualization(path_data, lang, num_docs, num_docs_for_words):
    """Render the Streamlit page for exploring per-document stats and filters.

    The page shows, in order: one slider per known statistic column found in
    the data, the documents kept / thrown away by those cutoffs, a histogram
    per active statistic, a Zipf's-law plot of word frequencies, a word-length
    filter with its own tables and histogram, and a download button for the
    raw JSON file.

    Args:
        path_data: Path to a JSON file containing a list of document records.
            Each record is read through its ``"text"`` key; the statistic
            keys (``special_characters_ratio``, ``stopwords_ratio``,
            ``badwords_ratio``, ``lang_id_score``, ``perplexity_score``) are
            optional and only produce a slider when present.
        lang: Language name, interpolated into the page title.
        num_docs: Maximum number of documents shown in the document tables
            (clamped to the number of records in the file).
        num_docs_for_words: Number of leading documents whose words feed the
            word-length section.
    """
    # Function-scope import so the block stays self-contained.
    from collections import Counter

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path_data, encoding="utf-8") as json_file:
        data = json.load(json_file)

    num_docs = min(num_docs, len(data))

    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")

    # Words are obtained by splitting on single spaces, exactly as displayed.
    words = [
        word
        for doc in data[:num_docs_for_words]
        for word in doc["text"].split(" ")
    ]
    # Explicit columns keep "len_word"/"word" available even with zero words.
    words_data = pd.DataFrame(
        {"len_word": [len(word) for word in words], "word": words},
        columns=["len_word", "word"],
    )

    data = pd.DataFrame(data[:num_docs])

    columns = list(data)
    # Each entry: (column name, chosen cutoff, True if the cutoff is a maximum).
    keys = []

    st.header("Parameters of the filtering")

    # (column, slider label, default value, step, cutoff-is-a-maximum)
    ratio_filters = [
        (
            "special_characters_ratio",
            "Max cutoff special characters ratio",
            1.0,
            0.01,
            True,
        ),
        ("stopwords_ratio", "Min cutoff stopwords ratio", 0.0, 0.01, False),
        ("badwords_ratio", "Max cutoff badwords ratio", 1.0, 0.001, True),
        ("lang_id_score", "Min cutoff lang id score", 0.0, 0.01, False),
    ]
    for key, label, default, step, max_cutoff in ratio_filters:
        if key in columns:
            cutoff = st.slider(label, 0.0, 1.0, default, step=step)
            keys.append((key, cutoff, max_cutoff))

    if "perplexity_score" in columns:
        # Upper bound is derived from the data so the default keeps everything.
        max_pp = int(np.max(data["perplexity_score"])) + 1
        cutoff_perplexity_score = st.slider(
            "Perplexity cutoff perplexity score", 0, max_pp, max_pp
        )
        keys.append(("perplexity_score", cutoff_perplexity_score, True))

    # Start from "keep everything" so that having no active filter yields a
    # full-length boolean mask (np.all over an empty list would collapse to a
    # scalar and break the .loc selection below).
    cond = np.ones(len(data), dtype=bool)
    for key, cutoff, max_cutoff in keys:
        column = data[key]
        passes = (column <= cutoff) if max_cutoff else (column >= cutoff)
        cond &= np.asarray(passes, dtype=bool)

    def show_table(title, dataframe):
        """Display a titled, sortable table with the usual usage hints."""
        st.header(title)
        st.markdown("Click on a column to sort by it.")
        st.markdown("Place the cursor on the text to display it.")
        st.dataframe(dataframe)

    data_keep = data.loc[cond]
    show_table("Data that we keep", data_keep)

    data_not_keep = data.loc[~cond]
    show_table("Data that is thrown away", data_not_keep)

    def plot_hist(dataframe, key, num_bins=50):
        """Draw a bar chart of ``dataframe[key]`` binned over [0, max]."""
        st.header(" ".join(key.split("_")))
        hist_values = dataframe[key].values
        if hist_values.size == 0:
            # np.max raises on empty input; nothing to draw in that case.
            st.markdown("No data to display.")
            return
        max_range = np.max(hist_values)
        hist_values = np.histogram(
            hist_values, bins=num_bins, range=(0, max_range)
        )[0]
        st.bar_chart(hist_values)
        st.markdown(f"Each bin is of size: {max_range/num_bins}.")

    for key, _, _ in keys:
        plot_hist(data, key)

    st.header("Zipf's Law")

    def get_frequency_words(dataframe):
        """Return word counts of the ``text`` column, sorted descending."""
        counts = Counter()
        # .get avoids a KeyError on a frame that has no "text" column at all
        # (e.g. when the input file holds an empty list).
        for text in dataframe.get("text", ()):
            counts.update(text.split(" "))
        return np.array(sorted(counts.values(), reverse=True))

    freq_words_data = get_frequency_words(data)
    freq_words_data_keep = get_frequency_words(data_keep)
    freq_words_data_not_keep = get_frequency_words(data_not_keep)

    fig, ax = plt.subplots()
    ax.loglog(freq_words_data)
    ax.loglog(freq_words_data_keep)
    ax.loglog(freq_words_data_not_keep)
    ax.set_title("Zipf's Law")
    ax.set_xlabel("$i$-th most frequent word")
    ax.set_ylabel("frequency in the documents")
    ax.legend(["All data", "Data that we keep", "Data that is thrown away"])
    st.pyplot(fig)

    st.markdown(
        "If less than three curves are displayed, it means that there are overlaps."
    )

    st.header("Parameter of the filtering for words")
    # With no words at all, fall back to the smallest valid slider range.
    if len(words_data):
        max_len_word = int(np.max(words_data["len_word"])) + 1
    else:
        max_len_word = 1
    cutoff_word = st.slider(
        "Max cutoff length word", 0, max_len_word, max_len_word
    )
    cond_words = np.asarray(words_data["len_word"] <= cutoff_word, dtype=bool)

    words_keep = words_data.loc[cond_words]
    show_table(
        f"Words that we keep (for {num_docs_for_words} documents)", words_keep
    )

    words_not_keep = words_data.loc[~cond_words]
    show_table(
        f"Words that are thrown away (for {num_docs_for_words} documents)",
        words_not_keep,
    )

    plot_hist(words_data, "len_word")

    st.header("Download data")

    # The open text handle is passed straight to the download widget.
    with open(path_data, encoding="utf-8") as json_file:
        st.download_button(
            label="Download data as json",
            data=json_file,
            file_name="data.json",
        )
147
+
148
+
149
# Page configuration: input file, language shown in the title, and how many
# documents feed the document tables and the word-level section respectively.
path_data, lang = "./en_examples_with_stats.json", "English"
num_docs, num_docs_for_words = 5000, 500

visualization(
    path_data,
    lang,
    num_docs,
    num_docs_for_words,
)