Dreamsome committed
Commit e8ec88b · 1 Parent(s): 4d9ad4a

change ord

Files changed (1)
  1. app.py +116 -115
app.py CHANGED
@@ -4,12 +4,11 @@ import os
 
 enable_xorbits = False
 
-
 if enable_xorbits:
     import xorbits.pandas as pd
     import xorbits.numpy as np
     import xorbits
-    xorbits.init(n_worker=1, n_cpu=2)
+    xorbits.init()
 else:
     import pandas as pd
     import numpy as np
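The change above drops the explicit cluster sizing: with no arguments, xorbits.init() starts the local runtime with its default settings instead of the previously pinned n_worker=1, n_cpu=2. Below is a minimal sketch of the same toggle pattern in isolation; the toy DataFrame and the print are illustrative only and are not part of app.py.

enable_xorbits = False  # flip to True to run the same pandas code on Xorbits

if enable_xorbits:
    import xorbits.pandas as pd
    import xorbits

    # No n_worker/n_cpu arguments: xorbits.init() falls back to its defaults.
    xorbits.init()
else:
    import pandas as pd

# Hypothetical toy data, just to show the same API works on either backend.
df = pd.DataFrame({"text": ["a short doc", "another document"]})
print(df["text"].apply(len))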
@@ -69,7 +68,7 @@ with st.spinner('Loading meta'):
 sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1)
 
 tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
-    ["Introduction", "Junk Data🤖", "Contamination🧹", "Short Documents🌐", "Biased Content🛡️", "Duplication🔍"])
+    ["Introduction", "Junk Data🤖", "Short Documents🌐", "Biased Content🛡️", "Contamination🧹", "Duplication🔍"])
 with tab0:
 
     st.markdown(
@@ -205,7 +204,120 @@ This piece of Python code calculated a measure of "impurity" in text documents,
     )
 
 
-with tab2:
+with tab2:
+    st.header('Toxic Content')
+    st.markdown('''
+It is crucial in the training of language models to be vigilant and potentially apply tools
+to exclude toxic content from the pre-training datasets. This practice helps to
+prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
+
+One approach to address this issue is by scanning the text for **offensive words**.
+For instance, the creators of the C4 dataset have implemented such a
+filtering mechanism. The following code references this
+[word list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open-sourced.
+
+The following code utilizes the word list to quantify the "biased content ratio" in the dataset.
+
+    ''')
+
+    metrics, code = st.tabs(['Metrics', 'Code'])
+    with metrics:
+        with st.spinner('Calculating toxic ratio...'):
+            df = datasets['train']
+
+            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+                lines = f.readlines()
+
+            banned_words = [line.rstrip('\n') for line in lines]
+            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+            total_num_docs = len(df)
+            biased_num_docs = df['matches'].sum()
+            biased_content_ratio = biased_num_docs / total_num_docs
+            col1, col2, col3 = st.columns(3)
+
+            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
+            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
+            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
+            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
+    with code:
+        st.code(
+            '''
+with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+    lines = f.readlines()
+
+banned_words = [line.rstrip('\n') for line in lines]
+df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+total_num_docs = len(df)
+df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+biased_num_docs = df['matches'].sum()
+biased_content_ratio = biased_num_docs / total_num_docs
+            '''
+        )
+
+
+
+with tab3:
+    st.header("Too-Short Documents")
+
+    st.markdown('''
+The aim of language modeling is to master the generation of text based on preceding tokens.
+In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
+100 tokens) from the corpus could aid in the reduction of noise, by producing contiguous text to
+model dependencies within the text.
+
+
+Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
+of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
+model can understand. Choose a tokenizer for your model.
+    ''')
+    metrics, code = st.tabs(['Metrics', 'Code'])
+
+    with metrics:
+        with st.spinner('Calculating too-short ratio...'):
+            from transformers import BertTokenizer
+
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+            df = datasets['train']
+            # Create a new column with the number of tokens for each text
+            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+            total_num_docs = len(df)
+            too_short_docs = len(df[df['text_length'] < 100])
+            too_short_doc_ratio = too_short_docs / total_num_docs
+
+            col1, col2, col3 = st.columns(3)
+            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
+            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
+            col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
+
+            # col1, _ = st.columns([2, 1])
+
+            # import seaborn as sns
+            # import matplotlib.pyplot as plt
+            # fig, ax = plt.subplots(figsize=(10, 5))
+            # ax.set_title('Distribution of text length (in tokens)')
+            # sns.histplot(data=df, x='text_length', ax=ax)
+            # plt.axvline(100, color='r', linestyle='--')
+            # col1.pyplot(fig)
+    with code:
+        st.code(
+            '''
+from transformers import BertTokenizer
+
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+df = datasets['train']
+# Create a new column with the number of tokens for each text
+df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+total_num_docs = len(df)
+too_short_docs = len(df[df['text_length'] < 100])
+too_short_doc_ratio = too_short_docs / total_num_docs
+            '''
+        )
+
+
+with tab4:
     st.header('Contamination')
 
     st.markdown('''
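The Toxic Content tab added in this hunk flags a document whenever any of its whitespace-split tokens appears in the LDNOOBW word list. Below is a standalone sketch of that check, using a set for faster membership tests; the word-list path matches the one in the app, while the toy DataFrame is a placeholder for datasets['train'].

import pandas as pd

# Load the banned-word list into a set so each membership test is O(1).
with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
    banned_words = {line.strip() for line in f if line.strip()}

# Placeholder data; the app uses datasets['train'] instead.
df = pd.DataFrame({'text': ['An innocuous sentence.', 'Another clean document.']})

# Same idea as the app: lower-case, split on whitespace, keep any banned hits.
df['banned_words_in_text'] = df['text'].apply(
    lambda text: [w for w in text.lower().split() if w in banned_words]
)
df['matches'] = df['banned_words_in_text'].apply(bool)

biased_content_ratio = df['matches'].sum() / len(df)
print('biased ratio: %.2f%%' % (biased_content_ratio * 100))

Iterating over each document's tokens and testing membership in a set scales with document length rather than with the size of the word list, which matters once the sample rate is raised.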
@@ -309,117 +421,6 @@ data was heavily used in their benchmark datasets.
     '''
     )
 
-with tab3:
-    st.header("Too-Short Documents")
-
-    st.markdown('''
-The aim of language modeling is to master the generation of text based on preceding tokens.
-In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
-100 tokens) from the corpus could aid in the reduction of noise, by producing contiguous text to
-model dependencies within the text.
-
-
-Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
-of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
-model can understand. Choose a tokenizer for your model.
-    ''')
-    metrics, code = st.tabs(['Metrics', 'Code'])
-
-    with metrics:
-        with st.spinner('Calculating too-short ratio...'):
-            from transformers import BertTokenizer
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-            df = datasets['train']
-            # Create a new column with the number of tokens for each text
-            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
-            total_num_docs = len(df)
-            too_short_docs = len(df[df['text_length'] < 100])
-            too_short_doc_ratio = too_short_docs / total_num_docs
-
-            col1, col2, col3 = st.columns(3)
-            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
-            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
-            col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
-
-            # col1, _ = st.columns([2, 1])
-
-            # import seaborn as sns
-            # import matplotlib.pyplot as plt
-            # fig, ax = plt.subplots(figsize=(10, 5))
-            # ax.set_title('Distribution of text length (in tokens)')
-            # sns.histplot(data=df, x='text_length', ax=ax)
-            # plt.axvline(100, color='r', linestyle='--')
-            # col1.pyplot(fig)
-    with code:
-        st.code(
-            '''
-from transformers import BertTokenizer
-
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-df = datasets['train']
-# Create a new column with the number of tokens for each text
-df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
-total_num_docs = len(df)
-too_short_docs = len(df[df['text_length'] < 100])
-too_short_doc_ratio = too_short_docs / total_num_docs
-            '''
-        )
-
-with tab4:
-    st.header('Toxic Content')
-    st.markdown('''
-It is crucial in the training of language models to be vigilant and potentially apply tools
-to exclude toxic content from the pre-training datasets. This practice helps to
-prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
-
-One approach to address this issue is by scanning the text for **offensive words**.
-For instance, the creators of the C4 dataset have implemented such a
-filtering mechanism. The follow code references this
-[word ](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open source.
-
-The following code utilizes the word list to quantify the "biased content ratio" in the dataset.
-
-    ''')
-
-    metrics, code = st.tabs(['Metrics', 'Code'])
-    with metrics:
-        with st.spinner('Calculating toxic ratio...'):
-            df = datasets['train']
-
-            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
-                lines = f.readlines()
-
-            banned_words = [line.rstrip('\n') for line in lines]
-            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
-            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
-            total_num_docs = len(df)
-            biased_num_docs = df['matches'].sum()
-            biased_content_ratio = biased_num_docs / total_num_docs
-            col1, col2, col3 = st.columns(3)
-
-            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
-            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
-            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
-            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
-    with code:
-        st.code(
-            '''
-with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
-    lines = f.readlines()
-
-banned_words = [line.rstrip('\n') for line in lines]
-df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
-total_num_docs = len(df)
-df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
-biased_num_docs = df['matches'].sum()
-biased_content_ratio = biased_num_docs / total_num_docs
-            '''
-        )
-
-
 
 with tab5:
     st.header("Duplication")
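This hunk only removes the old copies of the Too-Short Documents and Toxic Content tabs, which the previous hunk re-added further up in their new order; the token-length heuristic itself is unchanged. For reference, a self-contained sketch of that heuristic with the same tokenizer and 100-token threshold; the toy DataFrame is a placeholder for datasets['train'].

from transformers import BertTokenizer
import pandas as pd

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Placeholder data; the app uses datasets['train'] instead.
df = pd.DataFrame({'text': ['tiny doc', 'a slightly longer document about data quality']})

# Count BERT wordpiece tokens per document and flag documents under 100 tokens.
df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
too_short_docs = int((df['text_length'] < 100).sum())
too_short_doc_ratio = too_short_docs / len(df)
print('%d of %d docs are under 100 tokens (%.2f%%)'
      % (too_short_docs, len(df), too_short_doc_ratio * 100))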
 