HugoLaurencon committed on
Commit
693f997
·
1 Parent(s): 611e98e

filter on repetition removal

Browse files
app.py CHANGED
@@ -7,6 +7,7 @@ import os
7
  import base64
8
  import json
9
  import pandas as pd
 
10
 
11
  import numpy as np
12
 
@@ -32,7 +33,7 @@ class Visualization:
32
 
33
  def preamble(self):
34
  st.markdown(
35
- "Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail."
36
  )
37
 
38
  def get_binary_file_downloader_html(bin_file, file_label="File"):
@@ -45,7 +46,7 @@ class Visualization:
45
  st.markdown(
46
  get_binary_file_downloader_html(
47
  self.path_instructions,
48
- "Download the filtering pipeline of OSCAR as pdf",
49
  ),
50
  unsafe_allow_html=True,
51
  )
@@ -73,16 +74,17 @@ class Visualization:
73
  doc["text"][: self.max_len_text_display]
74
  + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
75
  )
76
- self.docs = pd.DataFrame(docs)
 
77
 
78
  def set_title(self):
79
- st.title(f"{self.num_docs} {self.lang} documents from OSCAR with their stats.")
80
 
81
  def filtering_of_docs(self):
82
  st.sidebar.subheader("Parameters of the filtering on documents")
83
 
84
- def set_sliders(docs):
85
- columns = list(docs)
86
  keys = []
87
  conds = {}
88
 
@@ -99,7 +101,7 @@ class Visualization:
99
 
100
  if "number_words" in columns:
101
  cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
102
- max_nb_words = int(np.max(docs["number_words"])) + 1
103
  cutoff_min_number_words = st.sidebar.slider(
104
  cutoff_def, 0, min(max_nb_words, 500), 0
105
  )
@@ -119,6 +121,46 @@ class Visualization:
119
 
120
  conds["number_words"] = [cond_1, cond_2]
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  if "special_characters_ratio" in columns:
123
  cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
124
  cutoff_special_characters_ratio = st.sidebar.slider(
@@ -169,7 +211,7 @@ class Visualization:
169
 
170
  if "perplexity_score" in columns:
171
  cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
172
- max_pp = int(np.max(docs["perplexity_score"])) + 1
173
  cutoff_perplexity_score = st.sidebar.slider(
174
  cutoff_def, 0, max_pp, max_pp
175
  )
@@ -181,7 +223,7 @@ class Visualization:
181
 
182
  return keys, conds
183
 
184
- self.keys, conds = set_sliders(self.docs)
185
 
186
  all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
187
  all_conds = np.all(all_conds, axis=0)
@@ -215,6 +257,13 @@ class Visualization:
215
  "Discarded documents for the filter on the number of words",
216
  )
217
 
 
 
 
 
 
 
 
218
  if "special_characters_ratio" in columns:
219
  cond_filter = np.invert(
220
  np.all(conds["special_characters_ratio"], axis=0)
@@ -360,9 +409,9 @@ class Visualization:
360
  self.download_data()
361
 
362
 
363
- path_instructions = "./filtering_pipeline_oscar.pdf"
364
- path_data = "./zh_examples_with_stats.json"
365
- lang = "Chinese"
366
  num_docs = 5000
367
  num_docs_for_words = 500
368
  max_len_text_display = 10000
 
7
  import base64
8
  import json
9
  import pandas as pd
10
+ pd.options.mode.chained_assignment = None
11
 
12
  import numpy as np
13
 
 
33
 
34
  def preamble(self):
35
  st.markdown(
36
+ "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
37
  )
38
 
39
  def get_binary_file_downloader_html(bin_file, file_label="File"):
 
46
  st.markdown(
47
  get_binary_file_downloader_html(
48
  self.path_instructions,
49
+ "Download the explanation of the filtering pipeline as pdf",
50
  ),
51
  unsafe_allow_html=True,
52
  )
 
74
  doc["text"][: self.max_len_text_display]
75
  + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
76
  )
77
+ self.docs_checkpoint = pd.DataFrame(docs)
78
+ self.docs = self.docs_checkpoint
79
 
80
  def set_title(self):
81
+ st.title(f"{self.num_docs} {self.lang} documents with their stats.")
82
 
83
  def filtering_of_docs(self):
84
  st.sidebar.subheader("Parameters of the filtering on documents")
85
 
86
+ def set_sliders():
87
+ columns = list(self.docs)
88
  keys = []
89
  conds = {}
90
 
 
101
 
102
  if "number_words" in columns:
103
  cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
104
+ max_nb_words = int(np.max(self.docs["number_words"])) + 1
105
  cutoff_min_number_words = st.sidebar.slider(
106
  cutoff_def, 0, min(max_nb_words, 500), 0
107
  )
 
121
 
122
  conds["number_words"] = [cond_1, cond_2]
123
 
124
+ if "repetitions_ratio" in columns:
125
+ val_repetitions_lengths = list(
126
+ self.docs["repetitions_ratio"].iloc[0].keys()
127
+ )
128
+ default_index = (
129
+ val_repetitions_lengths.index("10")
130
+ if "10" in val_repetitions_lengths
131
+ else 0
132
+ )
133
+ label_selectbox = (
134
+ "Length of the repetitions (that will determine the repetitions ratio). "
135
+ "Choosing a higher or lower number does not mean that the filtering "
136
+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
137
+ "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
138
+ "few or no repetitions, simply because their length gives them more diversity, and we do "
139
+ "not want to discard such documents."
140
+ )
141
+ repetitions_length = st.sidebar.selectbox(
142
+ label=label_selectbox,
143
+ options=val_repetitions_lengths,
144
+ index=default_index,
145
+ )
146
+ self.docs = self.docs_checkpoint
147
+ for i in range(len(self.docs["repetitions_ratio"])):
148
+ self.docs["repetitions_ratio"].iloc[i] = self.docs["repetitions_ratio"].iloc[i][repetitions_length]
149
+
150
+ cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
151
+ cutoff_repetitions_ratio = st.sidebar.slider(
152
+ cutoff_def, 0.0, 1.0, 1.0, step=0.01
153
+ )
154
+ new_key = (
155
+ "repetitions_ratio",
156
+ cutoff_repetitions_ratio,
157
+ True,
158
+ )
159
+ keys.append(new_key)
160
+ cond = get_cond(new_key[0], new_key[1], new_key[2])
161
+ print_discared_by_cond(cond)
162
+ conds["repetitions_ratio"] = [cond]
163
+
164
  if "special_characters_ratio" in columns:
165
  cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
166
  cutoff_special_characters_ratio = st.sidebar.slider(
 
211
 
212
  if "perplexity_score" in columns:
213
  cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
214
+ max_pp = int(np.max(self.docs["perplexity_score"])) + 1
215
  cutoff_perplexity_score = st.sidebar.slider(
216
  cutoff_def, 0, max_pp, max_pp
217
  )
 
223
 
224
  return keys, conds
225
 
226
+ self.keys, conds = set_sliders()
227
 
228
  all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
229
  all_conds = np.all(all_conds, axis=0)
 
257
  "Discarded documents for the filter on the number of words",
258
  )
259
 
260
+ if "repetitions_ratio" in columns:
261
+ cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
262
+ display_dataset(
263
+ cond_filter,
264
+ "Discarded documents for the filter on the repetitions ratio",
265
+ )
266
+
267
  if "special_characters_ratio" in columns:
268
  cond_filter = np.invert(
269
  np.all(conds["special_characters_ratio"], axis=0)
 
409
  self.download_data()
410
 
411
 
412
+ path_instructions = "./explanation_filtering_pipeline.pdf"
413
+ path_data = "./en_examples_with_stats.json"
414
+ lang = "English"
415
  num_docs = 5000
416
  num_docs_for_words = 500
417
  max_len_text_display = 10000
en_examples_with_stats.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2325873414309a7ea67d2753202207a2773319dc40f338c0a0fc7bb703463a6
3
- size 713107133
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00106fc2a9d51bbc78ce1ca2d05f2f402bf927a1f741f6c092b3f17cb9f16801
3
+ size 237353442
explanation_filtering_pipeline.pdf ADDED
Binary file (216 kB). View file
 
filtering_pipeline_oscar.pdf DELETED
Binary file (196 kB)
 
zh_examples_with_stats.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:438a5bb757c23581784946f345a99ab11b77c43f57a3cbf18148c197ec4ef741
3
- size 193517532