Spaces:
Runtime error
change ord
app.py CHANGED
@@ -4,12 +4,11 @@ import os
 
 enable_xorbits = False
 
-
 if enable_xorbits:
     import xorbits.pandas as pd
     import xorbits.numpy as np
     import xorbits
-    xorbits.init(
+    xorbits.init()
 else:
     import pandas as pd
     import numpy as np
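The removed line above left xorbits.init( unclosed, a syntax error that would stop app.py from loading at all (consistent with the Space's "Runtime error" badge); the hunk completes the call. For reference, a minimal runnable sketch of the same drop-in backend pattern, assuming only that xorbits is installed; the toy DataFrame is illustrative, not the app's data:

# Conditional backend selection, as in the hunk above.
enable_xorbits = False

if enable_xorbits:
    import xorbits.pandas as pd
    import xorbits.numpy as np
    import xorbits
    xorbits.init()  # start a local Xorbits runtime; the unclosed call was the bug
else:
    import pandas as pd
    import numpy as np

# Downstream code uses pd/np the same way with either backend.
df = pd.DataFrame({'text': ['one short document', 'another document']})
print(len(df))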
@@ -69,7 +68,7 @@ with st.spinner('Loading meta'):
 sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1)
 
 tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
-    ["Introduction", "Junk Data🤖", "
+    ["Introduction", "Junk Data🤖", "Short Documents🌐", "Biased Content🛡️", "Contamination🧹", "Duplication🔍"])
 with tab0:
 
     st.markdown(
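One caveat in the context line above, untouched by this commit: st.sidebar.slider is given value=0.05 with min_value=0.1, and Streamlit rejects a default outside [min_value, max_value] with a StreamlitAPIException at render time. A sketch of a self-consistent call; the 0.05 floor and step are an illustrative choice, not the app's:

import streamlit as st

# Keep the 5% default by aligning the lower bound and step with it.
sample_rate_option = st.sidebar.slider(
    'Select sample rate',
    value=0.05,
    min_value=0.05,
    max_value=1.0,
    step=0.05,
)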
@@ -205,7 +204,120 @@ This piece of Python code calculated a measure of "impurity" in text documents,
 )
 
 
-with tab2:
+with tab2:
+    st.header('Toxic Content')
+    st.markdown('''
+    It is crucial in the training of language models to be vigilant and potentially apply tools
+    to exclude toxic content from the pre-training datasets. This practice helps to
+    prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
+
+    One approach to address this issue is by scanning the text for **offensive words**.
+    For instance, the creators of the C4 dataset have implemented such a
+    filtering mechanism. The following code references the
+    [word list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open-sourced.
+
+    The following code uses the word list to quantify the "biased content ratio" in the dataset.
+
+    ''')
+
+    metrics, code = st.tabs(['Metrics', 'Code'])
+    with metrics:
+        with st.spinner('Calculating toxic ratio...'):
+            df = datasets['train']
+
+            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+                lines = f.readlines()
+
+            banned_words = [line.rstrip('\n') for line in lines]
+            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+            total_num_docs = len(df)
+            biased_num_docs = df['matches'].sum()
+            biased_content_ratio = biased_num_docs / total_num_docs
+            col1, col2, col3 = st.columns(3)
+
+            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
+            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
+            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
+            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
+    with code:
+        st.code(
+            '''
+with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+    lines = f.readlines()
+
+banned_words = [line.rstrip('\n') for line in lines]
+df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+total_num_docs = len(df)
+df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+biased_num_docs = df['matches'].sum()
+biased_content_ratio = biased_num_docs / total_num_docs
+            '''
+        )
+
+
+
+with tab3:
+    st.header("Too-Short Documents")
+
+    st.markdown('''
+    The aim of language modeling is to master the generation of text based on preceding tokens.
+    In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
+    100 tokens) from the corpus can help reduce noise and leave contiguous text for
+    modeling dependencies within the text.
+
+
+    Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
+    of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
+    model can understand. Choose a tokenizer for your model.
+    ''')
+    metrics, code = st.tabs(['Metrics', 'Code'])
+
+    with metrics:
+        with st.spinner('Calculating too-short ratio...'):
+            from transformers import BertTokenizer
+
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+            df = datasets['train']
+            # Create a new column with the number of tokens for each text
+            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+            total_num_docs = len(df)
+            too_short_docs = len(df[df['text_length'] < 100])
+            too_short_doc_ratio = too_short_docs / total_num_docs
+
+            col1, col2, col3 = st.columns(3)
+            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
+            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
+            col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
+
+            # col1, _ = st.columns([2, 1])
+
+            # import seaborn as sns
+            # import matplotlib.pyplot as plt
+            # fig, ax = plt.subplots(figsize=(10, 5))
+            # ax.set_title('Distribution of text length (in tokens)')
+            # sns.histplot(data=df, x='text_length', ax=ax)
+            # plt.axvline(100, color='r', linestyle='--')
+            # col1.pyplot(fig)
+    with code:
+        st.code(
+            '''
+from transformers import BertTokenizer
+
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+df = datasets['train']
+# Create a new column with the number of tokens for each text
+df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+total_num_docs = len(df)
+too_short_docs = len(df[df['text_length'] < 100])
+too_short_doc_ratio = too_short_docs / total_num_docs
+            '''
+        )
+
+
+with tab4:
     st.header('Contamination')
 
     st.markdown('''
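A note on the matching code added above: the lambda rescans the entire banned-word list for every document. A set intersection yields the same per-document matches in a single pass over each document's tokens; a sketch, assuming df and banned_words as defined in the hunk:

# Same matching logic with O(1) membership tests instead of a nested scan.
banned_set = set(banned_words)

def find_banned(text):
    # Split once, then intersect the tokens with the banned-word set.
    return list(banned_set.intersection(text.lower().split()))

df['banned_words_in_text'] = df['text'].apply(find_banned)
df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
biased_content_ratio = df['matches'].sum() / len(df)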
@@ -309,117 +421,6 @@ data was heavily used in their benchmark datasets.
 '''
 )
 
-with tab3:
-    st.header("Too-Short Documents")
-
-    st.markdown('''
-    The aim of language modeling is to master the generation of text based on preceding tokens.
-    In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
-    100 tokens) from the corpus can help reduce noise and leave contiguous text for
-    modeling dependencies within the text.
-
-
-    Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
-    of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
-    model can understand. Choose a tokenizer for your model.
-    ''')
-    metrics, code = st.tabs(['Metrics', 'Code'])
-
-    with metrics:
-        with st.spinner('Calculating too-short ratio...'):
-            from transformers import BertTokenizer
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-            df = datasets['train']
-            # Create a new column with the number of tokens for each text
-            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
-            total_num_docs = len(df)
-            too_short_docs = len(df[df['text_length'] < 100])
-            too_short_doc_ratio = too_short_docs / total_num_docs
-
-            col1, col2, col3 = st.columns(3)
-            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
-            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
-            col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
-
-            # col1, _ = st.columns([2, 1])
-
-            # import seaborn as sns
-            # import matplotlib.pyplot as plt
-            # fig, ax = plt.subplots(figsize=(10, 5))
-            # ax.set_title('Distribution of text length (in tokens)')
-            # sns.histplot(data=df, x='text_length', ax=ax)
-            # plt.axvline(100, color='r', linestyle='--')
-            # col1.pyplot(fig)
-    with code:
-        st.code(
-            '''
-from transformers import BertTokenizer
-
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-df = datasets['train']
-# Create a new column with the number of tokens for each text
-df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
-total_num_docs = len(df)
-too_short_docs = len(df[df['text_length'] < 100])
-too_short_doc_ratio = too_short_docs / total_num_docs
-            '''
-        )
-
-with tab4:
-    st.header('Toxic Content')
-    st.markdown('''
-    It is crucial in the training of language models to be vigilant and potentially apply tools
-    to exclude toxic content from the pre-training datasets. This practice helps to
-    prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
-
-    One approach to address this issue is by scanning the text for **offensive words**.
-    For instance, the creators of the C4 dataset have implemented such a
-    filtering mechanism. The following code references the
-    [word list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open-sourced.
-
-    The following code uses the word list to quantify the "biased content ratio" in the dataset.
-
-    ''')
-
-    metrics, code = st.tabs(['Metrics', 'Code'])
-    with metrics:
-        with st.spinner('Calculating toxic ratio...'):
-            df = datasets['train']
-
-            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
-                lines = f.readlines()
-
-            banned_words = [line.rstrip('\n') for line in lines]
-            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
-            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
-            total_num_docs = len(df)
-            biased_num_docs = df['matches'].sum()
-            biased_content_ratio = biased_num_docs / total_num_docs
-            col1, col2, col3 = st.columns(3)
-
-            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
-            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
-            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
-            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
-    with code:
-        st.code(
-            '''
-with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
-    lines = f.readlines()
-
-banned_words = [line.rstrip('\n') for line in lines]
-df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
-total_num_docs = len(df)
-df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
-biased_num_docs = df['matches'].sum()
-biased_content_ratio = biased_num_docs / total_num_docs
-            '''
-        )
-
-
 
 with tab5:
     st.header("Duplication")
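Both the added and the removed too-short checks call tokenizer.tokenize row by row with the pure-Python BertTokenizer, the slowest path through transformers. A sketch of the same token-count filter using the Rust-backed BertTokenizerFast and one batched call; the toy DataFrame stands in for the app's datasets['train']:

from transformers import BertTokenizerFast
import pandas as pd

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Toy stand-in for datasets['train'].
df = pd.DataFrame({'text': ['a very short document', 'another brief example']})

# Encode all texts in one batched call instead of tokenizing per row.
encodings = tokenizer(df['text'].tolist(), add_special_tokens=False)
df['text_length'] = [len(ids) for ids in encodings['input_ids']]

too_short_doc_ratio = (df['text_length'] < 100).mean()
print("Too-short doc ratio: %.2f%%" % (too_short_doc_ratio * 100))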
|