Spaces:
Runtime error
Runtime error
HugoLaurencon
committed on
Commit
·
693f997
1
Parent(s):
611e98e
filter on repetition removal
Browse files
- app.py +61 -12
- en_examples_with_stats.json +2 -2
- explanation_filtering_pipeline.pdf +0 -0
- filtering_pipeline_oscar.pdf +0 -0
- zh_examples_with_stats.json +0 -3
app.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7 |
import base64
|
8 |
import json
|
9 |
import pandas as pd
|
|
|
10 |
|
11 |
import numpy as np
|
12 |
|
@@ -32,7 +33,7 @@ class Visualization:
|
|
32 |
|
33 |
def preamble(self):
|
34 |
st.markdown(
|
35 |
-
"Before diving into this demo, you might want to take a look at how the filtering pipeline
|
36 |
)
|
37 |
|
38 |
def get_binary_file_downloader_html(bin_file, file_label="File"):
|
@@ -45,7 +46,7 @@ class Visualization:
|
|
45 |
st.markdown(
|
46 |
get_binary_file_downloader_html(
|
47 |
self.path_instructions,
|
48 |
-
"Download the filtering pipeline
|
49 |
),
|
50 |
unsafe_allow_html=True,
|
51 |
)
|
@@ -73,16 +74,17 @@ class Visualization:
|
|
73 |
doc["text"][: self.max_len_text_display]
|
74 |
+ " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
|
75 |
)
|
76 |
-
self.
|
|
|
77 |
|
78 |
def set_title(self):
|
79 |
-
st.title(f"{self.num_docs} {self.lang} documents
|
80 |
|
81 |
def filtering_of_docs(self):
|
82 |
st.sidebar.subheader("Parameters of the filtering on documents")
|
83 |
|
84 |
-
def set_sliders(
|
85 |
-
columns = list(docs)
|
86 |
keys = []
|
87 |
conds = {}
|
88 |
|
@@ -99,7 +101,7 @@ class Visualization:
|
|
99 |
|
100 |
if "number_words" in columns:
|
101 |
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
|
102 |
-
max_nb_words = int(np.max(docs["number_words"])) + 1
|
103 |
cutoff_min_number_words = st.sidebar.slider(
|
104 |
cutoff_def, 0, min(max_nb_words, 500), 0
|
105 |
)
|
@@ -119,6 +121,46 @@ class Visualization:
|
|
119 |
|
120 |
conds["number_words"] = [cond_1, cond_2]
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
if "special_characters_ratio" in columns:
|
123 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
124 |
cutoff_special_characters_ratio = st.sidebar.slider(
|
@@ -169,7 +211,7 @@ class Visualization:
|
|
169 |
|
170 |
if "perplexity_score" in columns:
|
171 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
172 |
-
max_pp = int(np.max(docs["perplexity_score"])) + 1
|
173 |
cutoff_perplexity_score = st.sidebar.slider(
|
174 |
cutoff_def, 0, max_pp, max_pp
|
175 |
)
|
@@ -181,7 +223,7 @@ class Visualization:
|
|
181 |
|
182 |
return keys, conds
|
183 |
|
184 |
-
self.keys, conds = set_sliders(
|
185 |
|
186 |
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
|
187 |
all_conds = np.all(all_conds, axis=0)
|
@@ -215,6 +257,13 @@ class Visualization:
|
|
215 |
"Discarded documents for the filter on the number of words",
|
216 |
)
|
217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
if "special_characters_ratio" in columns:
|
219 |
cond_filter = np.invert(
|
220 |
np.all(conds["special_characters_ratio"], axis=0)
|
@@ -360,9 +409,9 @@ class Visualization:
|
|
360 |
self.download_data()
|
361 |
|
362 |
|
363 |
-
path_instructions = "./
|
364 |
-
path_data = "./
|
365 |
-
lang = "
|
366 |
num_docs = 5000
|
367 |
num_docs_for_words = 500
|
368 |
max_len_text_display = 10000
|
|
|
7 |
import base64
|
8 |
import json
|
9 |
import pandas as pd
|
10 |
+
pd.options.mode.chained_assignment = None
|
11 |
|
12 |
import numpy as np
|
13 |
|
|
|
33 |
|
34 |
def preamble(self):
|
35 |
st.markdown(
|
36 |
+
"Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
|
37 |
)
|
38 |
|
39 |
def get_binary_file_downloader_html(bin_file, file_label="File"):
|
|
|
46 |
st.markdown(
|
47 |
get_binary_file_downloader_html(
|
48 |
self.path_instructions,
|
49 |
+
"Download the explanation of the filtering pipeline as pdf",
|
50 |
),
|
51 |
unsafe_allow_html=True,
|
52 |
)
|
|
|
74 |
doc["text"][: self.max_len_text_display]
|
75 |
+ " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
|
76 |
)
|
77 |
+
self.docs_checkpoint = pd.DataFrame(docs)
|
78 |
+
self.docs = self.docs_checkpoint
|
79 |
|
80 |
def set_title(self):
|
81 |
+
st.title(f"{self.num_docs} {self.lang} documents with their stats.")
|
82 |
|
83 |
def filtering_of_docs(self):
|
84 |
st.sidebar.subheader("Parameters of the filtering on documents")
|
85 |
|
86 |
+
def set_sliders():
|
87 |
+
columns = list(self.docs)
|
88 |
keys = []
|
89 |
conds = {}
|
90 |
|
|
|
101 |
|
102 |
if "number_words" in columns:
|
103 |
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
|
104 |
+
max_nb_words = int(np.max(self.docs["number_words"])) + 1
|
105 |
cutoff_min_number_words = st.sidebar.slider(
|
106 |
cutoff_def, 0, min(max_nb_words, 500), 0
|
107 |
)
|
|
|
121 |
|
122 |
conds["number_words"] = [cond_1, cond_2]
|
123 |
|
124 |
+
if "repetitions_ratio" in columns:
|
125 |
+
val_repetitions_lengths = list(
|
126 |
+
self.docs["repetitions_ratio"].iloc[0].keys()
|
127 |
+
)
|
128 |
+
default_index = (
|
129 |
+
val_repetitions_lengths.index("10")
|
130 |
+
if "10" in val_repetitions_lengths
|
131 |
+
else 0
|
132 |
+
)
|
133 |
+
label_selectbox = (
|
134 |
+
"Length of the repetitions (that will determine the repetitions ratio). "
|
135 |
+
"Choosing a higher or lower number does not mean that the filtering "
|
136 |
+
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
|
137 |
+
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
|
138 |
+
"few or no repetitions, simply because their length gives them more diversity, and we do "
|
139 |
+
"not want to discard such documents."
|
140 |
+
)
|
141 |
+
repetitions_length = st.sidebar.selectbox(
|
142 |
+
label=label_selectbox,
|
143 |
+
options=val_repetitions_lengths,
|
144 |
+
index=default_index,
|
145 |
+
)
|
146 |
+
self.docs = self.docs_checkpoint
|
147 |
+
for i in range(len(self.docs["repetitions_ratio"])):
|
148 |
+
self.docs["repetitions_ratio"].iloc[i] = self.docs["repetitions_ratio"].iloc[i][repetitions_length]
|
149 |
+
|
150 |
+
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
|
151 |
+
cutoff_repetitions_ratio = st.sidebar.slider(
|
152 |
+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
|
153 |
+
)
|
154 |
+
new_key = (
|
155 |
+
"repetitions_ratio",
|
156 |
+
cutoff_repetitions_ratio,
|
157 |
+
True,
|
158 |
+
)
|
159 |
+
keys.append(new_key)
|
160 |
+
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
161 |
+
print_discared_by_cond(cond)
|
162 |
+
conds["repetitions_ratio"] = [cond]
|
163 |
+
|
164 |
if "special_characters_ratio" in columns:
|
165 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
166 |
cutoff_special_characters_ratio = st.sidebar.slider(
|
|
|
211 |
|
212 |
if "perplexity_score" in columns:
|
213 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
214 |
+
max_pp = int(np.max(self.docs["perplexity_score"])) + 1
|
215 |
cutoff_perplexity_score = st.sidebar.slider(
|
216 |
cutoff_def, 0, max_pp, max_pp
|
217 |
)
|
|
|
223 |
|
224 |
return keys, conds
|
225 |
|
226 |
+
self.keys, conds = set_sliders()
|
227 |
|
228 |
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
|
229 |
all_conds = np.all(all_conds, axis=0)
|
|
|
257 |
"Discarded documents for the filter on the number of words",
|
258 |
)
|
259 |
|
260 |
+
if "repetitions_ratio" in columns:
|
261 |
+
cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
|
262 |
+
display_dataset(
|
263 |
+
cond_filter,
|
264 |
+
"Discarded documents for the filter on the repetitions ratio",
|
265 |
+
)
|
266 |
+
|
267 |
if "special_characters_ratio" in columns:
|
268 |
cond_filter = np.invert(
|
269 |
np.all(conds["special_characters_ratio"], axis=0)
|
|
|
409 |
self.download_data()
|
410 |
|
411 |
|
412 |
+
path_instructions = "./explanation_filtering_pipeline.pdf"
|
413 |
+
path_data = "./en_examples_with_stats.json"
|
414 |
+
lang = "English"
|
415 |
num_docs = 5000
|
416 |
num_docs_for_words = 500
|
417 |
max_len_text_display = 10000
|
en_examples_with_stats.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:00106fc2a9d51bbc78ce1ca2d05f2f402bf927a1f741f6c092b3f17cb9f16801
|
3 |
+
size 237353442
|
explanation_filtering_pipeline.pdf
ADDED
Binary file (216 kB). View file
|
|
filtering_pipeline_oscar.pdf
DELETED
Binary file (196 kB)
|
|
zh_examples_with_stats.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:438a5bb757c23581784946f345a99ab11b77c43f57a3cbf18148c197ec4ef741
|
3 |
-
size 193517532
|
|
|
|
|
|
|
|