meg-huggingface committed on
Commit
5d4982b
·
2 Parent(s): 58471d2 e122a90

Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main

Browse files
app.py CHANGED
@@ -157,6 +157,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
157
  dstats.load_or_prepare_text_duplicates()
158
  dstats.load_or_prepare_npmi()
159
  dstats.load_or_prepare_zipf()
 
 
160
 
161
  def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
162
  """
 
157
  dstats.load_or_prepare_text_duplicates()
158
  dstats.load_or_prepare_npmi()
159
  dstats.load_or_prepare_zipf()
160
+ # Don't recalculate; we're live
161
+ dstats.set_deployment(True)
162
 
163
  def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
164
  """
data_measurements/dataset_statistics.py CHANGED
@@ -299,6 +299,15 @@ class DatasetStatisticsCacheClass:
299
  # Needed for UI
300
  self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
301
 
 
 
 
 
 
 
 
 
 
302
  def get_base_dataset(self):
303
  """Gets a pointer to the truncated base dataset object."""
304
  if not self.dset:
@@ -378,31 +387,34 @@ class DatasetStatisticsCacheClass:
378
  write_json(self.length_stats_dict, self.length_stats_json_fid)
379
 
380
  def prepare_length_df(self):
381
- if self.tokenized_df is None:
382
- self.tokenized_df = self.do_tokenization()
383
- self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
384
- TOKENIZED_FIELD].apply(len)
385
- self.length_df = self.tokenized_df[
386
- [LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
387
- by=[LENGTH_FIELD], ascending=True
388
- )
 
389
 
390
  def prepare_text_length_stats(self):
391
- if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
392
- self.prepare_length_df()
393
- avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
394
- self.avg_length = round(avg_length, 1)
395
- std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
396
- self.std_length = round(std_length, 1)
397
- self.num_uniq_lengths = len(self.length_df["length"].unique())
398
- self.length_stats_dict = {"avg length": self.avg_length,
399
- "std length": self.std_length,
400
- "num lengths": self.num_uniq_lengths}
 
401
 
402
  def prepare_fig_text_lengths(self):
403
- if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
404
- self.prepare_length_df()
405
- self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
 
406
 
407
  def load_or_prepare_embeddings(self, save=True):
408
  if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
@@ -489,39 +501,41 @@ class DatasetStatisticsCacheClass:
489
  self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
490
 
491
  def prepare_general_stats(self):
492
- if self.tokenized_df is None:
493
- logs.warning("Tokenized dataset not yet loaded; doing so.")
494
- self.load_or_prepare_dataset()
495
- if self.vocab_counts_df is None:
496
- logs.warning("Vocab not yet loaded; doing so.")
497
- self.load_or_prepare_vocab()
498
- self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
499
- "count", ascending=False
500
- ).head(_TOP_N)
501
- self.total_words = len(self.vocab_counts_df)
502
- self.total_open_words = len(self.vocab_counts_filtered_df)
503
- self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
504
- self.prepare_text_duplicates()
505
- self.dedup_total = sum(self.dup_counts_df[CNT])
506
- self.general_stats_dict = {
507
- TOT_WORDS: self.total_words,
508
- TOT_OPEN_WORDS: self.total_open_words,
509
- TEXT_NAN_CNT: self.text_nan_count,
510
- DEDUP_TOT: self.dedup_total,
511
- }
 
512
 
513
  def prepare_text_duplicates(self):
514
- if self.tokenized_df is None:
515
- self.load_or_prepare_tokenized_df()
516
- dup_df = self.tokenized_df[
517
- self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
518
- self.dup_counts_df = pd.DataFrame(
519
- dup_df.pivot_table(
520
- columns=[OUR_TEXT_FIELD], aggfunc="size"
521
- ).sort_values(ascending=False),
522
- columns=[CNT],
523
- )
524
- self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
 
525
 
526
  def load_or_prepare_dataset(self, save=True):
527
  """
@@ -557,12 +571,13 @@ class DatasetStatisticsCacheClass:
557
  if (self.use_cache and exists(self.tokenized_df_fid)):
558
  self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
559
  else:
560
- # tokenize all text instances
561
- self.tokenized_df = self.do_tokenization()
562
- if save:
563
- logs.warning("Saving tokenized dataset to disk")
564
- # save tokenized text
565
- write_df(self.tokenized_df, self.tokenized_df_fid)
 
566
 
567
  def load_or_prepare_text_dset(self, save=True):
568
  if (self.use_cache and exists(self.text_dset_fid)):
@@ -572,22 +587,24 @@ class DatasetStatisticsCacheClass:
572
  logs.info(self.text_dset)
573
  # ...Or load it from the server and store it anew
574
  else:
575
- self.prepare_text_dset()
576
- if save:
577
- # save extracted text instances
578
- logs.warning("Saving dataset to disk")
579
- self.text_dset.save_to_disk(self.text_dset_fid)
 
580
 
581
  def prepare_text_dset(self):
582
- self.get_base_dataset()
583
- # extract all text instances
584
- self.text_dset = self.dset.map(
585
- lambda examples: extract_field(
586
- examples, self.text_field, OUR_TEXT_FIELD
587
- ),
588
- batched=True,
589
- remove_columns=list(self.dset.features),
590
- )
 
591
 
592
  def do_tokenization(self):
593
  """
@@ -646,25 +663,27 @@ class DatasetStatisticsCacheClass:
646
  if save:
647
  write_plotly(self.fig_labels, self.fig_labels_json_fid)
648
  else:
649
- self.prepare_labels()
650
- if save:
651
- # save extracted label instances
652
- self.label_dset.save_to_disk(self.label_dset_fid)
653
- write_plotly(self.fig_labels, self.fig_labels_json_fid)
 
654
 
655
  def prepare_labels(self):
656
- self.get_base_dataset()
657
- self.label_dset = self.dset.map(
658
- lambda examples: extract_field(
659
- examples, self.label_field, OUR_LABEL_FIELD
660
- ),
661
- batched=True,
662
- remove_columns=list(self.dset.features),
663
- )
664
- self.label_df = self.label_dset.to_pandas()
665
- self.fig_labels = make_fig_labels(
666
- self.label_df, self.label_names, OUR_LABEL_FIELD
667
- )
 
668
 
669
  def load_or_prepare_npmi(self):
670
  self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
@@ -708,6 +727,7 @@ class nPMIStatisticsCacheClass:
708
  by calling the nPMI class with the user's selections."""
709
 
710
  def __init__(self, dataset_stats, use_cache=False):
 
711
  self.dstats = dataset_stats
712
  self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
713
  if not isdir(self.pmi_cache_path):
@@ -784,16 +804,17 @@ class nPMIStatisticsCacheClass:
784
  joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
785
  # When maybe some things have been computed for the selected subgroups.
786
  else:
787
- logs.info("Preparing new joint npmi")
788
- joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
789
- subgroup_pair, subgroup_files
790
- )
791
- # Cache new results
792
- logs.info("Writing out.")
793
- for subgroup in subgroup_pair:
794
- write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
795
- with open(joint_npmi_fid, "w+") as f:
796
- joint_npmi_df.to_csv(f)
 
797
  logs.info("The joint npmi df is")
798
  logs.info(joint_npmi_df)
799
  return joint_npmi_df
 
299
  # Needed for UI
300
  self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
301
 
302
+ self.live = False
303
+
304
+ def set_deployment(self, live=True):
305
+ """
306
+ Function that we can hit when we deploy, so that cache files are not
307
+ written out/recalculated, but instead that part of the UI can be punted.
308
+ """
309
+ self.live = live
310
+
311
  def get_base_dataset(self):
312
  """Gets a pointer to the truncated base dataset object."""
313
  if not self.dset:
 
387
  write_json(self.length_stats_dict, self.length_stats_json_fid)
388
 
389
  def prepare_length_df(self):
390
+ if not self.live:
391
+ if self.tokenized_df is None:
392
+ self.tokenized_df = self.do_tokenization()
393
+ self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
394
+ TOKENIZED_FIELD].apply(len)
395
+ self.length_df = self.tokenized_df[
396
+ [LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
397
+ by=[LENGTH_FIELD], ascending=True
398
+ )
399
 
400
  def prepare_text_length_stats(self):
401
+ if not self.live:
402
+ if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
403
+ self.prepare_length_df()
404
+ avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
405
+ self.avg_length = round(avg_length, 1)
406
+ std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
407
+ self.std_length = round(std_length, 1)
408
+ self.num_uniq_lengths = len(self.length_df["length"].unique())
409
+ self.length_stats_dict = {"avg length": self.avg_length,
410
+ "std length": self.std_length,
411
+ "num lengths": self.num_uniq_lengths}
412
 
413
  def prepare_fig_text_lengths(self):
414
+ if not self.live:
415
+ if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
416
+ self.prepare_length_df()
417
+ self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
418
 
419
  def load_or_prepare_embeddings(self, save=True):
420
  if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
 
501
  self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
502
 
503
  def prepare_general_stats(self):
504
+ if not self.live:
505
+ if self.tokenized_df is None:
506
+ logs.warning("Tokenized dataset not yet loaded; doing so.")
507
+ self.load_or_prepare_dataset()
508
+ if self.vocab_counts_df is None:
509
+ logs.warning("Vocab not yet loaded; doing so.")
510
+ self.load_or_prepare_vocab()
511
+ self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
512
+ "count", ascending=False
513
+ ).head(_TOP_N)
514
+ self.total_words = len(self.vocab_counts_df)
515
+ self.total_open_words = len(self.vocab_counts_filtered_df)
516
+ self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
517
+ self.prepare_text_duplicates()
518
+ self.dedup_total = sum(self.dup_counts_df[CNT])
519
+ self.general_stats_dict = {
520
+ TOT_WORDS: self.total_words,
521
+ TOT_OPEN_WORDS: self.total_open_words,
522
+ TEXT_NAN_CNT: self.text_nan_count,
523
+ DEDUP_TOT: self.dedup_total,
524
+ }
525
 
526
  def prepare_text_duplicates(self):
527
+ if not self.live:
528
+ if self.tokenized_df is None:
529
+ self.load_or_prepare_tokenized_df()
530
+ dup_df = self.tokenized_df[
531
+ self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
532
+ self.dup_counts_df = pd.DataFrame(
533
+ dup_df.pivot_table(
534
+ columns=[OUR_TEXT_FIELD], aggfunc="size"
535
+ ).sort_values(ascending=False),
536
+ columns=[CNT],
537
+ )
538
+ self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
539
 
540
  def load_or_prepare_dataset(self, save=True):
541
  """
 
571
  if (self.use_cache and exists(self.tokenized_df_fid)):
572
  self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
573
  else:
574
+ if not self.live:
575
+ # tokenize all text instances
576
+ self.tokenized_df = self.do_tokenization()
577
+ if save:
578
+ logs.warning("Saving tokenized dataset to disk")
579
+ # save tokenized text
580
+ write_df(self.tokenized_df, self.tokenized_df_fid)
581
 
582
  def load_or_prepare_text_dset(self, save=True):
583
  if (self.use_cache and exists(self.text_dset_fid)):
 
587
  logs.info(self.text_dset)
588
  # ...Or load it from the server and store it anew
589
  else:
590
+ if not self.live:
591
+ self.prepare_text_dset()
592
+ if save:
593
+ # save extracted text instances
594
+ logs.warning("Saving dataset to disk")
595
+ self.text_dset.save_to_disk(self.text_dset_fid)
596
 
597
  def prepare_text_dset(self):
598
+ if not self.live:
599
+ self.get_base_dataset()
600
+ # extract all text instances
601
+ self.text_dset = self.dset.map(
602
+ lambda examples: extract_field(
603
+ examples, self.text_field, OUR_TEXT_FIELD
604
+ ),
605
+ batched=True,
606
+ remove_columns=list(self.dset.features),
607
+ )
608
 
609
  def do_tokenization(self):
610
  """
 
663
  if save:
664
  write_plotly(self.fig_labels, self.fig_labels_json_fid)
665
  else:
666
+ if not self.live:
667
+ self.prepare_labels()
668
+ if save:
669
+ # save extracted label instances
670
+ self.label_dset.save_to_disk(self.label_dset_fid)
671
+ write_plotly(self.fig_labels, self.fig_labels_json_fid)
672
 
673
  def prepare_labels(self):
674
+ if not self.live:
675
+ self.get_base_dataset()
676
+ self.label_dset = self.dset.map(
677
+ lambda examples: extract_field(
678
+ examples, self.label_field, OUR_LABEL_FIELD
679
+ ),
680
+ batched=True,
681
+ remove_columns=list(self.dset.features),
682
+ )
683
+ self.label_df = self.label_dset.to_pandas()
684
+ self.fig_labels = make_fig_labels(
685
+ self.label_df, self.label_names, OUR_LABEL_FIELD
686
+ )
687
 
688
  def load_or_prepare_npmi(self):
689
  self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
 
727
  by calling the nPMI class with the user's selections."""
728
 
729
  def __init__(self, dataset_stats, use_cache=False):
730
+ self.live = dataset_stats.live
731
  self.dstats = dataset_stats
732
  self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
733
  if not isdir(self.pmi_cache_path):
 
804
  joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
805
  # When maybe some things have been computed for the selected subgroups.
806
  else:
807
+ if not self.live:
808
+ logs.info("Preparing new joint npmi")
809
+ joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
810
+ subgroup_pair, subgroup_files
811
+ )
812
+ # Cache new results
813
+ logs.info("Writing out.")
814
+ for subgroup in subgroup_pair:
815
+ write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
816
+ with open(joint_npmi_fid, "w+") as f:
817
+ joint_npmi_df.to_csv(f)
818
  logs.info("The joint npmi df is")
819
  logs.info(joint_npmi_df)
820
  return joint_npmi_df
data_measurements/streamlit_utils.py CHANGED
@@ -178,7 +178,11 @@ def expander_text_lengths(dstats, column_id):
178
  value=0,
179
  step=1,
180
  )
181
- st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
 
 
 
 
182
 
183
 
184
  ### Third, use a sentence embedding model
@@ -273,7 +277,7 @@ def expander_text_duplicates(dstats, column_id):
273
  st.write(
274
  "### Here is the list of all the duplicated items and their counts in your dataset:"
275
  )
276
- if dstats.dup_counts_df is None:
277
  st.write("There are no duplicates in this dataset! 🥳")
278
  else:
279
  gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
 
178
  value=0,
179
  step=1,
180
  )
181
+
182
+ # This is quite a large file and is breaking our ability to navigate the app development.
183
+ # Just passing if it's not already there for launch v0
184
+ if dstats.length_df is not None:
185
+ st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
186
 
187
 
188
  ### Third, use a sentence embedding model
 
277
  st.write(
278
  "### Here is the list of all the duplicated items and their counts in your dataset:"
279
  )
280
+ if dstats.dup_counts_df is None or dstats.dup_counts_df.empty:
281
  st.write("There are no duplicates in this dataset! 🥳")
282
  else:
283
  gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)