HugoLaurencon committed on
Commit
6303415
·
1 Parent(s): 07c617e
Files changed (2) hide show
  1. app.py +41 -15
  2. filtering_pipeline_oscar.pdf +0 -0
app.py CHANGED
@@ -2,6 +2,9 @@
2
 
3
  import streamlit as st
4
 
 
 
 
5
  import json
6
  import pandas as pd
7
 
@@ -12,14 +15,27 @@ import matplotlib.pyplot as plt
12
 
13
  class Visualization:
14
  def __init__(
15
- self, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
16
  ):
 
17
  self.path_data = path_data
18
  self.lang = lang
19
  self.num_docs = num_docs
20
  self.num_docs_for_words = num_docs_for_words
21
  self.max_len_text_display = max_len_text_display
22
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def open_data(self):
24
  with open(self.path_data) as json_file:
25
  data = json.load(json_file)
@@ -42,7 +58,7 @@ class Visualization:
42
  self.docs = pd.DataFrame(docs)
43
 
44
  def set_title(self):
45
- st.title(f"{self.num_docs} {self.lang} documents from Oscar with their stats.")
46
 
47
  def filtering_of_docs(self):
48
  st.sidebar.subheader("Parameters of the filtering on documents")
@@ -59,14 +75,15 @@ class Visualization:
59
 
60
  def print_discared_by_cond(cond):
61
  st.sidebar.caption(
62
- f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter"
63
  )
64
  st.sidebar.caption("---------")
65
 
66
  if "number_words" in columns:
 
67
  max_nb_words = int(np.max(docs["number_words"])) + 1
68
  cutoff_min_number_words = st.sidebar.slider(
69
- "Min cutoff number words", 0, max_nb_words, 0
70
  )
71
  new_key = ("number_words", cutoff_min_number_words, False)
72
  keys.append(new_key)
@@ -74,8 +91,9 @@ class Visualization:
74
  conds.append(cond)
75
  print_discared_by_cond(cond)
76
 
 
77
  cutoff_max_number_words = st.sidebar.slider(
78
- "Max cutoff number words", 0, max_nb_words, max_nb_words
79
  )
80
  new_key = ("number_words", cutoff_max_number_words, True)
81
  keys.append(new_key)
@@ -84,8 +102,9 @@ class Visualization:
84
  print_discared_by_cond(cond)
85
 
86
  if "special_characters_ratio" in columns:
 
87
  cutoff_special_characters_ratio = st.sidebar.slider(
88
- "Max cutoff special characters ratio", 0.0, 1.0, 1.0, step=0.01
89
  )
90
  new_key = (
91
  "special_characters_ratio",
@@ -98,8 +117,9 @@ class Visualization:
98
  print_discared_by_cond(cond)
99
 
100
  if "stopwords_ratio" in columns:
 
101
  cutoff_stopwords_ratio = st.sidebar.slider(
102
- "Min cutoff stopwords ratio", 0.0, 1.0, 0.0, step=0.01
103
  )
104
  new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
105
  keys.append(new_key)
@@ -108,8 +128,9 @@ class Visualization:
108
  print_discared_by_cond(cond)
109
 
110
  if "badwords_ratio" in columns:
 
111
  cutoff_badwords_ratio = st.sidebar.slider(
112
- "Max cutoff badwords ratio", 0.0, 1.0, 1.0, step=0.01
113
  )
114
  new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
115
  keys.append(new_key)
@@ -118,8 +139,9 @@ class Visualization:
118
  print_discared_by_cond(cond)
119
 
120
  if "lang_id_score" in columns:
 
121
  cutoff_lang_id_score = st.sidebar.slider(
122
- "Min cutoff lang id score", 0.0, 1.0, 0.0, step=0.01
123
  )
124
  new_key = ("lang_id_score", cutoff_lang_id_score, False)
125
  keys.append(new_key)
@@ -128,9 +150,10 @@ class Visualization:
128
  print_discared_by_cond(cond)
129
 
130
  if "perplexity_score" in columns:
 
131
  max_pp = int(np.max(docs["perplexity_score"])) + 1
132
  cutoff_perplexity_score = st.sidebar.slider(
133
- "Perplexity cutoff perplexity score", 0, max_pp, max_pp
134
  )
135
  new_key = ("perplexity_score", cutoff_perplexity_score, True)
136
  keys.append(new_key)
@@ -167,13 +190,14 @@ class Visualization:
167
  def filtering_of_words(self):
168
  st.sidebar.subheader("Parameter of the filtering on words")
169
 
170
- max_len_word = int(np.max(self.words["len_word"])) + 1
171
- cutoff_word = st.sidebar.slider(
172
- "Max cutoff length word", 0, max_len_word, max_len_word
173
  )
 
 
174
 
175
  incorrect_substrings = st.sidebar.checkbox(
176
- "Remove words with incorrect substrings"
177
  )
178
 
179
  cond_words = self.words["len_word"] <= cutoff_word
@@ -258,6 +282,7 @@ class Visualization:
258
  )
259
 
260
  def visualization(self):
 
261
  self.open_data()
262
  self.set_title()
263
  self.filtering_of_docs()
@@ -267,6 +292,7 @@ class Visualization:
267
  self.download_data()
268
 
269
 
 
270
  path_data = "./en_examples_with_stats.json"
271
  lang = "English"
272
  num_docs = 5000
@@ -274,6 +300,6 @@ num_docs_for_words = 500
274
  max_len_text_display = 10000
275
 
276
  visualization = Visualization(
277
- path_data, lang, num_docs, num_docs_for_words, max_len_text_display
278
  )
279
  visualization.visualization()
 
2
 
3
  import streamlit as st
4
 
5
+ import os
6
+
7
+ import base64
8
  import json
9
  import pandas as pd
10
 
 
15
 
16
  class Visualization:
17
  def __init__(
18
+ self, path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
19
  ):
20
+ self.path_instructions = path_instructions
21
  self.path_data = path_data
22
  self.lang = lang
23
  self.num_docs = num_docs
24
  self.num_docs_for_words = num_docs_for_words
25
  self.max_len_text_display = max_len_text_display
26
 
27
+ def preamble(self):
28
+ st.markdown("Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail.")
29
+
30
+ def get_binary_file_downloader_html(bin_file, file_label='File'):
31
+ with open(bin_file, 'rb') as f:
32
+ data = f.read()
33
+ bin_str = base64.b64encode(data).decode()
34
+ href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
35
+ return href
36
+
37
+ st.markdown(get_binary_file_downloader_html(self.path_instructions, "Download the filtering pipeline of OSCAR as pdf"), unsafe_allow_html=True)
38
+
39
  def open_data(self):
40
  with open(self.path_data) as json_file:
41
  data = json.load(json_file)
 
58
  self.docs = pd.DataFrame(docs)
59
 
60
  def set_title(self):
61
+ st.title(f"{self.num_docs} {self.lang} documents from OSCAR with their stats.")
62
 
63
  def filtering_of_docs(self):
64
  st.sidebar.subheader("Parameters of the filtering on documents")
 
75
 
76
  def print_discared_by_cond(cond):
77
  st.sidebar.caption(
78
+ f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
79
  )
80
  st.sidebar.caption("---------")
81
 
82
  if "number_words" in columns:
83
+ cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
84
  max_nb_words = int(np.max(docs["number_words"])) + 1
85
  cutoff_min_number_words = st.sidebar.slider(
86
+ cutoff_def, 0, min(max_nb_words, 500), 0
87
  )
88
  new_key = ("number_words", cutoff_min_number_words, False)
89
  keys.append(new_key)
 
91
  conds.append(cond)
92
  print_discared_by_cond(cond)
93
 
94
+ cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
95
  cutoff_max_number_words = st.sidebar.slider(
96
+ cutoff_def, 0, max_nb_words, max_nb_words
97
  )
98
  new_key = ("number_words", cutoff_max_number_words, True)
99
  keys.append(new_key)
 
102
  print_discared_by_cond(cond)
103
 
104
  if "special_characters_ratio" in columns:
105
+ cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
106
  cutoff_special_characters_ratio = st.sidebar.slider(
107
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
108
  )
109
  new_key = (
110
  "special_characters_ratio",
 
117
  print_discared_by_cond(cond)
118
 
119
  if "stopwords_ratio" in columns:
120
+ cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
121
  cutoff_stopwords_ratio = st.sidebar.slider(
122
+ cutoff_def, 0.0, 1.0, 0.0, step=0.01
123
  )
124
  new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
125
  keys.append(new_key)
 
128
  print_discared_by_cond(cond)
129
 
130
  if "badwords_ratio" in columns:
131
+ cutoff_def = "If the bad words ratio of a document is higher than this number, the document is removed."
132
  cutoff_badwords_ratio = st.sidebar.slider(
133
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
134
  )
135
  new_key = ("badwords_ratio", cutoff_badwords_ratio, True)
136
  keys.append(new_key)
 
139
  print_discared_by_cond(cond)
140
 
141
  if "lang_id_score" in columns:
142
+ cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
143
  cutoff_lang_id_score = st.sidebar.slider(
144
+ cutoff_def, 0.0, 1.0, 0.0, step=0.01
145
  )
146
  new_key = ("lang_id_score", cutoff_lang_id_score, False)
147
  keys.append(new_key)
 
150
  print_discared_by_cond(cond)
151
 
152
  if "perplexity_score" in columns:
153
+ cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
154
  max_pp = int(np.max(docs["perplexity_score"])) + 1
155
  cutoff_perplexity_score = st.sidebar.slider(
156
+ cutoff_def, 0, max_pp, max_pp
157
  )
158
  new_key = ("perplexity_score", cutoff_perplexity_score, True)
159
  keys.append(new_key)
 
190
  def filtering_of_words(self):
191
  st.sidebar.subheader("Parameter of the filtering on words")
192
 
193
+ cutoff_def = (
194
+ "If the length of a word is higher than this number, the word is removed."
 
195
  )
196
+ max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
197
+ cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
198
 
199
  incorrect_substrings = st.sidebar.checkbox(
200
+ "Remove words with incorrect substrings."
201
  )
202
 
203
  cond_words = self.words["len_word"] <= cutoff_word
 
282
  )
283
 
284
  def visualization(self):
285
+ self.preamble()
286
  self.open_data()
287
  self.set_title()
288
  self.filtering_of_docs()
 
292
  self.download_data()
293
 
294
 
295
+ path_instructions = "./filtering_pipeline_oscar.pdf"
296
  path_data = "./en_examples_with_stats.json"
297
  lang = "English"
298
  num_docs = 5000
 
300
  max_len_text_display = 10000
301
 
302
  visualization = Visualization(
303
+ path_instructions, path_data, lang, num_docs, num_docs_for_words, max_len_text_display
304
  )
305
  visualization.visualization()
filtering_pipeline_oscar.pdf ADDED
Binary file (196 kB). View file