HugoLaurencon committed on
Commit
2c2527f
·
1 Parent(s): 5d485e5

everything in expanders

Browse files
Files changed (1) hide show
  1. app.py +229 -210
app.py CHANGED
@@ -111,19 +111,24 @@ class Visualization:
111
  self.docs = self.docs_checkpoint
112
 
113
  def set_title(self):
114
- st.title(f"{self.num_docs} {self.lang} documents with their stats.")
115
 
116
  @staticmethod
117
  def plot_hist(dataframe, key, num_bins=50):
118
- checkbox = st.checkbox("Diplay distribution", value=True, key=f"display_distribution_{key[0]}")
 
 
119
  if checkbox:
120
  fig, ax = plt.subplots()
121
  val = dataframe[key[0]].values
122
  if np.median(val) != 0:
123
- val = val[abs(val - np.median(val)) < 9 * np.median(np.absolute(val - np.median(val)))]
 
 
 
124
  ax.hist(val, bins=num_bins, density=True)
125
  ax.set_title(" ".join(key[0].split("_")))
126
- ax.axvline(x=key[1], color='r', linestyle='dashed')
127
  st.pyplot(fig)
128
 
129
  def filtering_of_docs(self):
@@ -273,9 +278,7 @@ class Visualization:
273
  with st.sidebar.expander("Perplexity score"):
274
  cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
275
  max_pp = int(np.max(self.docs["perplexity_score"])) + 1
276
- cutoff_perplexity_score = st.slider(
277
- cutoff_def, 0, max_pp, max_pp
278
- )
279
  new_key = ("perplexity_score", cutoff_perplexity_score, True)
280
  keys.append(new_key)
281
  Visualization.plot_hist(self.docs, new_key)
@@ -291,80 +294,96 @@ class Visualization:
291
  all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
292
  all_conds = np.all(all_conds, axis=0)
293
 
294
- st.header("Filtering on documents")
295
-
296
- def display_dataset(cond, description):
297
- displayed_docs = self.docs.loc[cond]
298
- st.subheader(
299
- f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
300
  )
301
- st.markdown(
302
- "Click on a column to sort by it, place the cursor on the text to display it."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  )
304
- st.dataframe(displayed_docs)
305
 
306
- display_dataset(np.invert(all_conds), "Discarded documents")
 
307
 
308
- # st.subheader("Display discarded documents by filter")
309
- display_discarded_documents_by_filter = st.checkbox(
310
- "Display discarded documents by filter"
311
- )
 
 
312
 
313
- if display_discarded_documents_by_filter:
314
- columns = list(self.docs)
 
 
 
 
315
 
316
- if "number_words" in columns:
317
- cond_filter = np.invert(np.all(conds["number_words"], axis=0))
318
- display_dataset(
319
- cond_filter,
320
- "Discarded documents for the filter on the number of words",
321
- )
 
 
322
 
323
- if "repetitions_ratio" in columns:
324
- cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
325
- display_dataset(
326
- cond_filter,
327
- "Discarded documents for the filter on the repetitions ratio",
328
- )
329
 
330
- if "special_characters_ratio" in columns:
331
- cond_filter = np.invert(
332
- np.all(conds["special_characters_ratio"], axis=0)
333
- )
334
- display_dataset(
335
- cond_filter,
336
- "Discarded documents for the filter on the special characters ratio",
337
- )
338
 
339
- if "stopwords_ratio" in columns:
340
- cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
341
- display_dataset(
342
- cond_filter,
343
- "Discarded documents for the filter on the stop words ratio",
344
- )
345
 
346
- if "flagged_words_ratio" in columns:
347
- cond_filter = np.invert(np.all(conds["flagged_words_ratio"], axis=0))
348
- display_dataset(
349
- cond_filter,
350
- "Discarded documents for the filter on the flagged words ratio",
351
- )
352
 
353
- if "lang_id_score" in columns:
354
- cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
355
- display_dataset(
356
- cond_filter,
357
- "Discarded documents for the filter on the language identification confidence score",
358
- )
359
 
360
- if "perplexity_score" in columns:
361
- cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
362
- display_dataset(
363
- cond_filter,
364
- "Discarded documents for the filter on the perplexity score",
365
- )
366
 
367
- display_dataset(all_conds, "Retained documents")
 
 
 
 
 
368
 
369
  def filtering_of_words(self):
370
  if not (self.words is None):
@@ -386,32 +405,39 @@ class Visualization:
386
 
387
  cond_words = self.words["len_word"] <= cutoff_word
388
  if incorrect_substrings:
389
- cond_words = cond_words & np.invert(self.words["incorrect_substring"])
 
 
390
 
391
- st.header("Filtering on words")
 
 
 
 
 
392
 
393
- st.markdown(
394
- f"Since the number of words is way larger than the number of documents, "
395
- f"we consider in this section words for the first {self.num_docs_for_words} documents only."
396
- )
397
 
398
- discarded_words = self.words.loc[np.invert(cond_words)]
399
- st.subheader(
400
- f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
401
- )
402
- st.markdown(
403
- "Click on a column to sort by it, place the cursor on the text to display it."
404
- )
405
- st.dataframe(discarded_words)
406
 
407
- retained_words = self.words.loc[cond_words]
408
- st.subheader(
409
- f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
410
- )
411
- st.markdown(
412
- "Click on a column to sort by it, place the cursor on the text to display it."
413
- )
414
- st.dataframe(retained_words)
415
 
416
  def download_parameters(self):
417
  st.sidebar.subheader("Download parameters")
@@ -421,6 +447,7 @@ class Visualization:
421
  file_name=f"parameters_{self.lang_dataset_id}.json",
422
  )
423
 
 
424
  def plot_zipf_law(self):
425
  if not (self.words is None):
426
  st.header("Zipf's Law")
@@ -441,144 +468,136 @@ class Visualization:
441
  ax.set_xlabel("$i$-th most frequent word")
442
  ax.set_ylabel("frequency in the documents")
443
  st.pyplot(fig)
 
444
 
445
  def analyse_personal_doc(self):
446
- st.header("Analyse your own document")
 
447
 
448
- personal_doc = st.text_area(
449
- label="Paste here the document you want to analyse",
450
- value="",
451
- max_chars=10000,
452
- )
453
 
454
- is_discarded = False
455
 
456
- def is_doc_discarded(key, score):
457
- if key[2]: # max cutoff
458
- return score > key[1]
459
- else:
460
- return score < key[1]
461
 
462
- if personal_doc:
463
 
464
- st.markdown("Statistics of the document:")
465
 
466
- for key in self.keys:
467
- if key[0] == "number_words":
468
- words = ModifyingDocuments.get_words_from_document(
469
- personal_doc,
470
- self.sentencepiece_model_tok,
471
- lower_case=False,
472
- strip_characters=self.param["strip_characters"],
473
- )
474
- if key[2]:
475
- st.markdown(f"Number of words: {len(words)}")
476
- if is_doc_discarded(key, len(words)):
477
- is_discarded = True
478
-
479
- elif key[0] == "repetitions_ratio":
480
- repetitions_ratio = Filtering.compute_repetitions_ratio(
481
- personal_doc, int(key[3])
482
- )
483
- repetitions_ratio = round(repetitions_ratio, 3)
484
- st.markdown(f"Repetitions ratio: {repetitions_ratio}")
485
- if is_doc_discarded(key, repetitions_ratio):
486
- is_discarded = True
487
-
488
- elif key[0] == "special_characters_ratio":
489
- special_characters_ratio = (
490
- Filtering.compute_special_characters_ratio(
491
- personal_doc, self.param["special_characters"]
492
  )
493
- )
494
- special_characters_ratio = round(special_characters_ratio, 3)
495
- st.markdown(f"Special characters ratio: {special_characters_ratio}")
496
- if is_doc_discarded(key, special_characters_ratio):
497
- is_discarded = True
498
-
499
- elif key[0] == "stopwords_ratio":
500
- stopwords_ratio = Filtering.compute_stopwords_ratio(
501
- personal_doc,
502
- self.sentencepiece_model_tok,
503
- self.param["strip_characters"],
504
- self.param["cond_words_augmentation"],
505
- self.param["words_augmentation_group_sizes"],
506
- self.param["words_augmentation_join_char"],
507
- self.stopwords,
508
- )
509
- stopwords_ratio = round(stopwords_ratio, 3)
510
- st.markdown(f"Stop words ratio: {stopwords_ratio}")
511
- if is_doc_discarded(key, stopwords_ratio):
512
- is_discarded = True
513
-
514
- elif key[0] == "flagged_words_ratio":
515
- flagged_words_ratio = Filtering.compute_flagged_words_ratio(
516
- personal_doc,
517
- self.sentencepiece_model_tok,
518
- self.param["strip_characters"],
519
- self.param["cond_words_augmentation"],
520
- self.param["words_augmentation_group_sizes"],
521
- self.param["words_augmentation_join_char"],
522
- self.flagged_words,
523
- )
524
- flagged_words_ratio = round(flagged_words_ratio, 3)
525
- st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
526
- if is_doc_discarded(key, flagged_words_ratio):
527
- is_discarded = True
528
-
529
- elif key[0] == "lang_id_score":
530
- (
531
- lang_pred_dataset_id,
532
- lang_id_score,
533
- ) = Filtering.compute_lang_id_pred_score(
534
- personal_doc, self.model_lang_id
535
- )
536
- lang_id_score = round(lang_id_score, 3)
537
- st.markdown(
538
- f"Language identification confidence score: {lang_id_score}"
539
- )
540
- if is_doc_discarded(key, flagged_words_ratio) or (
541
- self.lang_dataset_id != lang_pred_dataset_id
542
- ):
543
- is_discarded = True
544
-
545
- elif key[0] == "perplexity_score":
546
- perplexity_score = Filtering.compute_perplexity_score(
547
- personal_doc,
548
- self.sentencepiece_model,
549
- self.kenlm_model,
550
- )
551
- perplexity_score = round(perplexity_score, 3)
552
- st.markdown(f"Perplexity score: {perplexity_score}")
553
- if is_doc_discarded(key, perplexity_score):
554
- is_discarded = True
555
-
556
- is_discarded = "" if is_discarded else "not "
557
- st.markdown(
558
- f"With the current filtering parameters, this document **is {is_discarded}discarded**."
559
- )
560
-
561
- def download_data(self):
562
- st.header("Download data")
563
-
564
- with open(self.path_data) as json_file:
565
- btn = st.download_button(
566
- label="Download data as json",
567
- data=json_file,
568
- file_name="data.json",
569
- )
 
 
 
 
 
 
 
 
 
 
 
570
 
571
  def visualization(self):
572
- self.warning_preamble()
573
  self.preamble()
574
  self.open_data()
575
  self.set_title()
576
  self.filtering_of_docs()
577
  self.filtering_of_words()
578
  self.download_parameters()
579
- # self.plot_zipf_law()
580
  self.analyse_personal_doc()
581
- self.download_data()
582
 
583
 
584
  path_instructions = "./explanation_filtering_pipeline.pdf"
 
111
  self.docs = self.docs_checkpoint
112
 
113
  def set_title(self):
114
+ st.title(f"Filtering visualization")
115
 
116
  @staticmethod
117
  def plot_hist(dataframe, key, num_bins=50):
118
+ checkbox = st.checkbox(
119
+ "Diplay distribution", value=True, key=f"display_distribution_{key[0]}"
120
+ )
121
  if checkbox:
122
  fig, ax = plt.subplots()
123
  val = dataframe[key[0]].values
124
  if np.median(val) != 0:
125
+ val = val[
126
+ abs(val - np.median(val))
127
+ < 9 * np.median(np.absolute(val - np.median(val)))
128
+ ]
129
  ax.hist(val, bins=num_bins, density=True)
130
  ax.set_title(" ".join(key[0].split("_")))
131
+ ax.axvline(x=key[1], color="r", linestyle="dashed")
132
  st.pyplot(fig)
133
 
134
  def filtering_of_docs(self):
 
278
  with st.sidebar.expander("Perplexity score"):
279
  cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
280
  max_pp = int(np.max(self.docs["perplexity_score"])) + 1
281
+ cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp)
 
 
282
  new_key = ("perplexity_score", cutoff_perplexity_score, True)
283
  keys.append(new_key)
284
  Visualization.plot_hist(self.docs, new_key)
 
294
  all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
295
  all_conds = np.all(all_conds, axis=0)
296
 
297
+ with st.expander(
298
+ f"Filtering on documents, for {self.num_docs} {self.lang} documents"
299
+ ):
300
+ st.header(
301
+ f"Filtering on documents, for {self.num_docs} {self.lang} documents"
 
302
  )
303
+
304
+ def display_dataset(cond, description):
305
+ displayed_docs = self.docs.loc[cond]
306
+ st.subheader(
307
+ f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
308
+ )
309
+ st.markdown(
310
+ "Click on a column to sort by it, place the cursor on the text to display it."
311
+ )
312
+ st.dataframe(displayed_docs)
313
+
314
+ display_dataset(np.invert(all_conds), "Discarded documents")
315
+
316
+ # st.subheader("Display discarded documents by filter")
317
+ display_discarded_documents_by_filter = st.checkbox(
318
+ "Display discarded documents by filter"
319
  )
 
320
 
321
+ if display_discarded_documents_by_filter:
322
+ columns = list(self.docs)
323
 
324
+ if "number_words" in columns:
325
+ cond_filter = np.invert(np.all(conds["number_words"], axis=0))
326
+ display_dataset(
327
+ cond_filter,
328
+ "Discarded documents for the filter on the number of words",
329
+ )
330
 
331
+ if "repetitions_ratio" in columns:
332
+ cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
333
+ display_dataset(
334
+ cond_filter,
335
+ "Discarded documents for the filter on the repetitions ratio",
336
+ )
337
 
338
+ if "special_characters_ratio" in columns:
339
+ cond_filter = np.invert(
340
+ np.all(conds["special_characters_ratio"], axis=0)
341
+ )
342
+ display_dataset(
343
+ cond_filter,
344
+ "Discarded documents for the filter on the special characters ratio",
345
+ )
346
 
347
+ if "stopwords_ratio" in columns:
348
+ cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
349
+ display_dataset(
350
+ cond_filter,
351
+ "Discarded documents for the filter on the stop words ratio",
352
+ )
353
 
354
+ if "flagged_words_ratio" in columns:
355
+ cond_filter = np.invert(
356
+ np.all(conds["flagged_words_ratio"], axis=0)
357
+ )
358
+ display_dataset(
359
+ cond_filter,
360
+ "Discarded documents for the filter on the flagged words ratio",
361
+ )
362
 
363
+ if "lang_id_score" in columns:
364
+ cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
365
+ display_dataset(
366
+ cond_filter,
367
+ "Discarded documents for the filter on the language identification confidence score",
368
+ )
369
 
370
+ if "perplexity_score" in columns:
371
+ cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
372
+ display_dataset(
373
+ cond_filter,
374
+ "Discarded documents for the filter on the perplexity score",
375
+ )
376
 
377
+ display_dataset(all_conds, "Retained documents")
 
 
 
 
 
378
 
379
+ st.header("Download data")
 
 
 
 
 
380
 
381
+ with open(self.path_data) as json_file:
382
+ btn = st.download_button(
383
+ label="Download data as json",
384
+ data=json_file,
385
+ file_name="data.json",
386
+ )
387
 
388
  def filtering_of_words(self):
389
  if not (self.words is None):
 
405
 
406
  cond_words = self.words["len_word"] <= cutoff_word
407
  if incorrect_substrings:
408
+ cond_words = cond_words & np.invert(
409
+ self.words["incorrect_substring"]
410
+ )
411
 
412
+ with st.expander(
413
+ f"Filtering on words, for {self.num_docs} {self.lang} documents"
414
+ ):
415
+ st.header(
416
+ f"Filtering on words, for {self.num_docs} {self.lang} documents"
417
+ )
418
 
419
+ st.markdown(
420
+ f"Since the number of words is way larger than the number of documents, "
421
+ f"we consider in this section words for the first {self.num_docs_for_words} documents only."
422
+ )
423
 
424
+ discarded_words = self.words.loc[np.invert(cond_words)]
425
+ st.subheader(
426
+ f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
427
+ )
428
+ st.markdown(
429
+ "Click on a column to sort by it, place the cursor on the text to display it."
430
+ )
431
+ st.dataframe(discarded_words)
432
 
433
+ retained_words = self.words.loc[cond_words]
434
+ st.subheader(
435
+ f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
436
+ )
437
+ st.markdown(
438
+ "Click on a column to sort by it, place the cursor on the text to display it."
439
+ )
440
+ st.dataframe(retained_words)
441
 
442
  def download_parameters(self):
443
  st.sidebar.subheader("Download parameters")
 
447
  file_name=f"parameters_{self.lang_dataset_id}.json",
448
  )
449
 
450
+ """
451
  def plot_zipf_law(self):
452
  if not (self.words is None):
453
  st.header("Zipf's Law")
 
468
  ax.set_xlabel("$i$-th most frequent word")
469
  ax.set_ylabel("frequency in the documents")
470
  st.pyplot(fig)
471
+ """
472
 
473
  def analyse_personal_doc(self):
474
+ with st.expander("Analyse your own document"):
475
+ st.header("Analyse your own document")
476
 
477
+ personal_doc = st.text_area(
478
+ label="Paste here the document you want to analyse",
479
+ value="",
480
+ max_chars=10000,
481
+ )
482
 
483
+ is_discarded = False
484
 
485
+ def is_doc_discarded(key, score):
486
+ if key[2]: # max cutoff
487
+ return score > key[1]
488
+ else:
489
+ return score < key[1]
490
 
491
+ if personal_doc:
492
 
493
+ st.markdown("Statistics of the document:")
494
 
495
+ for key in self.keys:
496
+ if key[0] == "number_words":
497
+ words = ModifyingDocuments.get_words_from_document(
498
+ personal_doc,
499
+ self.sentencepiece_model_tok,
500
+ lower_case=False,
501
+ strip_characters=self.param["strip_characters"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  )
503
+ if key[2]:
504
+ st.markdown(f"Number of words: {len(words)}")
505
+ if is_doc_discarded(key, len(words)):
506
+ is_discarded = True
507
+
508
+ elif key[0] == "repetitions_ratio":
509
+ repetitions_ratio = Filtering.compute_repetitions_ratio(
510
+ personal_doc, int(key[3])
511
+ )
512
+ repetitions_ratio = round(repetitions_ratio, 3)
513
+ st.markdown(f"Repetitions ratio: {repetitions_ratio}")
514
+ if is_doc_discarded(key, repetitions_ratio):
515
+ is_discarded = True
516
+
517
+ elif key[0] == "special_characters_ratio":
518
+ special_characters_ratio = (
519
+ Filtering.compute_special_characters_ratio(
520
+ personal_doc, self.param["special_characters"]
521
+ )
522
+ )
523
+ special_characters_ratio = round(special_characters_ratio, 3)
524
+ st.markdown(
525
+ f"Special characters ratio: {special_characters_ratio}"
526
+ )
527
+ if is_doc_discarded(key, special_characters_ratio):
528
+ is_discarded = True
529
+
530
+ elif key[0] == "stopwords_ratio":
531
+ stopwords_ratio = Filtering.compute_stopwords_ratio(
532
+ personal_doc,
533
+ self.sentencepiece_model_tok,
534
+ self.param["strip_characters"],
535
+ self.param["cond_words_augmentation"],
536
+ self.param["words_augmentation_group_sizes"],
537
+ self.param["words_augmentation_join_char"],
538
+ self.stopwords,
539
+ )
540
+ stopwords_ratio = round(stopwords_ratio, 3)
541
+ st.markdown(f"Stop words ratio: {stopwords_ratio}")
542
+ if is_doc_discarded(key, stopwords_ratio):
543
+ is_discarded = True
544
+
545
+ elif key[0] == "flagged_words_ratio":
546
+ flagged_words_ratio = Filtering.compute_flagged_words_ratio(
547
+ personal_doc,
548
+ self.sentencepiece_model_tok,
549
+ self.param["strip_characters"],
550
+ self.param["cond_words_augmentation"],
551
+ self.param["words_augmentation_group_sizes"],
552
+ self.param["words_augmentation_join_char"],
553
+ self.flagged_words,
554
+ )
555
+ flagged_words_ratio = round(flagged_words_ratio, 3)
556
+ st.markdown(f"Flagged words ratio: {flagged_words_ratio}")
557
+ if is_doc_discarded(key, flagged_words_ratio):
558
+ is_discarded = True
559
+
560
+ elif key[0] == "lang_id_score":
561
+ (
562
+ lang_pred_dataset_id,
563
+ lang_id_score,
564
+ ) = Filtering.compute_lang_id_pred_score(
565
+ personal_doc, self.model_lang_id
566
+ )
567
+ lang_id_score = round(lang_id_score, 3)
568
+ st.markdown(
569
+ f"Language identification confidence score: {lang_id_score}"
570
+ )
571
+ if is_doc_discarded(key, flagged_words_ratio) or (
572
+ self.lang_dataset_id != lang_pred_dataset_id
573
+ ):
574
+ is_discarded = True
575
+
576
+ elif key[0] == "perplexity_score":
577
+ perplexity_score = Filtering.compute_perplexity_score(
578
+ personal_doc,
579
+ self.sentencepiece_model,
580
+ self.kenlm_model,
581
+ )
582
+ perplexity_score = round(perplexity_score, 3)
583
+ st.markdown(f"Perplexity score: {perplexity_score}")
584
+ if is_doc_discarded(key, perplexity_score):
585
+ is_discarded = True
586
+
587
+ is_discarded = "" if is_discarded else "not "
588
+ st.markdown(
589
+ f"With the current filtering parameters, this document **is {is_discarded}discarded**."
590
+ )
591
 
592
  def visualization(self):
593
+ # self.warning_preamble()
594
  self.preamble()
595
  self.open_data()
596
  self.set_title()
597
  self.filtering_of_docs()
598
  self.filtering_of_words()
599
  self.download_parameters()
 
600
  self.analyse_personal_doc()
 
601
 
602
 
603
  path_instructions = "./explanation_filtering_pipeline.pdf"