Spaces:
Build error
Build error
Merge branch 'main' of https://huggingface.co/spaces/huggingface/data-measurements-tool-2 into main
Browse files
- app.py +2 -0
- data_measurements/dataset_statistics.py +120 -99
- data_measurements/streamlit_utils.py +6 -2
app.py
CHANGED
@@ -157,6 +157,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
|
157 |
dstats.load_or_prepare_text_duplicates()
|
158 |
dstats.load_or_prepare_npmi()
|
159 |
dstats.load_or_prepare_zipf()
|
|
|
|
|
160 |
|
161 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
|
162 |
"""
|
|
|
157 |
dstats.load_or_prepare_text_duplicates()
|
158 |
dstats.load_or_prepare_npmi()
|
159 |
dstats.load_or_prepare_zipf()
|
160 |
+
# Don't recalculate; we're live
|
161 |
+
dstats.set_deployment(True)
|
162 |
|
163 |
def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
|
164 |
"""
|
data_measurements/dataset_statistics.py
CHANGED
@@ -299,6 +299,15 @@ class DatasetStatisticsCacheClass:
|
|
299 |
# Needed for UI
|
300 |
self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
|
301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
def get_base_dataset(self):
|
303 |
"""Gets a pointer to the truncated base dataset object."""
|
304 |
if not self.dset:
|
@@ -378,31 +387,34 @@ class DatasetStatisticsCacheClass:
|
|
378 |
write_json(self.length_stats_dict, self.length_stats_json_fid)
|
379 |
|
380 |
def prepare_length_df(self):
|
381 |
-
if self.
|
382 |
-
self.tokenized_df
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
|
|
389 |
|
390 |
def prepare_text_length_stats(self):
|
391 |
-
if
|
392 |
-
self.
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
|
|
401 |
|
402 |
def prepare_fig_text_lengths(self):
|
403 |
-
if
|
404 |
-
self.
|
405 |
-
|
|
|
406 |
|
407 |
def load_or_prepare_embeddings(self, save=True):
|
408 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
|
@@ -489,39 +501,41 @@ class DatasetStatisticsCacheClass:
|
|
489 |
self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
|
490 |
|
491 |
def prepare_general_stats(self):
|
492 |
-
if self.
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
|
|
512 |
|
513 |
def prepare_text_duplicates(self):
|
514 |
-
if self.
|
515 |
-
self.
|
516 |
-
|
517 |
-
self.tokenized_df
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
|
|
525 |
|
526 |
def load_or_prepare_dataset(self, save=True):
|
527 |
"""
|
@@ -557,12 +571,13 @@ class DatasetStatisticsCacheClass:
|
|
557 |
if (self.use_cache and exists(self.tokenized_df_fid)):
|
558 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
559 |
else:
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
|
|
566 |
|
567 |
def load_or_prepare_text_dset(self, save=True):
|
568 |
if (self.use_cache and exists(self.text_dset_fid)):
|
@@ -572,22 +587,24 @@ class DatasetStatisticsCacheClass:
|
|
572 |
logs.info(self.text_dset)
|
573 |
# ...Or load it from the server and store it anew
|
574 |
else:
|
575 |
-
self.
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
|
|
580 |
|
581 |
def prepare_text_dset(self):
|
582 |
-
self.
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
examples
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
|
|
591 |
|
592 |
def do_tokenization(self):
|
593 |
"""
|
@@ -646,25 +663,27 @@ class DatasetStatisticsCacheClass:
|
|
646 |
if save:
|
647 |
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
648 |
else:
|
649 |
-
self.
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
|
|
654 |
|
655 |
def prepare_labels(self):
|
656 |
-
self.
|
657 |
-
|
658 |
-
|
659 |
-
examples
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
self.
|
667 |
-
|
|
|
668 |
|
669 |
def load_or_prepare_npmi(self):
|
670 |
self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
|
@@ -708,6 +727,7 @@ class nPMIStatisticsCacheClass:
|
|
708 |
by calling the nPMI class with the user's selections."""
|
709 |
|
710 |
def __init__(self, dataset_stats, use_cache=False):
|
|
|
711 |
self.dstats = dataset_stats
|
712 |
self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
|
713 |
if not isdir(self.pmi_cache_path):
|
@@ -784,16 +804,17 @@ class nPMIStatisticsCacheClass:
|
|
784 |
joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
|
785 |
# When maybe some things have been computed for the selected subgroups.
|
786 |
else:
|
787 |
-
|
788 |
-
|
789 |
-
|
790 |
-
|
791 |
-
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
-
|
796 |
-
|
|
|
797 |
logs.info("The joint npmi df is")
|
798 |
logs.info(joint_npmi_df)
|
799 |
return joint_npmi_df
|
|
|
299 |
# Needed for UI
|
300 |
self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
|
301 |
|
302 |
+
self.live = False
|
303 |
+
|
304 |
+
def set_deployment(self, live=True):
|
305 |
+
"""
|
306 |
+
Function that we can hit when we deploy, so that cache files are not
|
307 |
+
written out/recalculated, but instead that part of the UI can be punted.
|
308 |
+
"""
|
309 |
+
self.live = live
|
310 |
+
|
311 |
def get_base_dataset(self):
|
312 |
"""Gets a pointer to the truncated base dataset object."""
|
313 |
if not self.dset:
|
|
|
387 |
write_json(self.length_stats_dict, self.length_stats_json_fid)
|
388 |
|
389 |
def prepare_length_df(self):
|
390 |
+
if not self.live:
|
391 |
+
if self.tokenized_df is None:
|
392 |
+
self.tokenized_df = self.do_tokenization()
|
393 |
+
self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[
|
394 |
+
TOKENIZED_FIELD].apply(len)
|
395 |
+
self.length_df = self.tokenized_df[
|
396 |
+
[LENGTH_FIELD, OUR_TEXT_FIELD]].sort_values(
|
397 |
+
by=[LENGTH_FIELD], ascending=True
|
398 |
+
)
|
399 |
|
400 |
def prepare_text_length_stats(self):
|
401 |
+
if not self.live:
|
402 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
|
403 |
+
self.prepare_length_df()
|
404 |
+
avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
|
405 |
+
self.avg_length = round(avg_length, 1)
|
406 |
+
std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
|
407 |
+
self.std_length = round(std_length, 1)
|
408 |
+
self.num_uniq_lengths = len(self.length_df["length"].unique())
|
409 |
+
self.length_stats_dict = {"avg length": self.avg_length,
|
410 |
+
"std length": self.std_length,
|
411 |
+
"num lengths": self.num_uniq_lengths}
|
412 |
|
413 |
def prepare_fig_text_lengths(self):
|
414 |
+
if not self.live:
|
415 |
+
if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
|
416 |
+
self.prepare_length_df()
|
417 |
+
self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
|
418 |
|
419 |
def load_or_prepare_embeddings(self, save=True):
|
420 |
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
|
|
|
501 |
self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]
|
502 |
|
503 |
def prepare_general_stats(self):
|
504 |
+
if not self.live:
|
505 |
+
if self.tokenized_df is None:
|
506 |
+
logs.warning("Tokenized dataset not yet loaded; doing so.")
|
507 |
+
self.load_or_prepare_dataset()
|
508 |
+
if self.vocab_counts_df is None:
|
509 |
+
logs.warning("Vocab not yet loaded; doing so.")
|
510 |
+
self.load_or_prepare_vocab()
|
511 |
+
self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
|
512 |
+
"count", ascending=False
|
513 |
+
).head(_TOP_N)
|
514 |
+
self.total_words = len(self.vocab_counts_df)
|
515 |
+
self.total_open_words = len(self.vocab_counts_filtered_df)
|
516 |
+
self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
|
517 |
+
self.prepare_text_duplicates()
|
518 |
+
self.dedup_total = sum(self.dup_counts_df[CNT])
|
519 |
+
self.general_stats_dict = {
|
520 |
+
TOT_WORDS: self.total_words,
|
521 |
+
TOT_OPEN_WORDS: self.total_open_words,
|
522 |
+
TEXT_NAN_CNT: self.text_nan_count,
|
523 |
+
DEDUP_TOT: self.dedup_total,
|
524 |
+
}
|
525 |
|
526 |
def prepare_text_duplicates(self):
|
527 |
+
if not self.live:
|
528 |
+
if self.tokenized_df is None:
|
529 |
+
self.load_or_prepare_tokenized_df()
|
530 |
+
dup_df = self.tokenized_df[
|
531 |
+
self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
|
532 |
+
self.dup_counts_df = pd.DataFrame(
|
533 |
+
dup_df.pivot_table(
|
534 |
+
columns=[OUR_TEXT_FIELD], aggfunc="size"
|
535 |
+
).sort_values(ascending=False),
|
536 |
+
columns=[CNT],
|
537 |
+
)
|
538 |
+
self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
|
539 |
|
540 |
def load_or_prepare_dataset(self, save=True):
|
541 |
"""
|
|
|
571 |
if (self.use_cache and exists(self.tokenized_df_fid)):
|
572 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
573 |
else:
|
574 |
+
if not self.live:
|
575 |
+
# tokenize all text instances
|
576 |
+
self.tokenized_df = self.do_tokenization()
|
577 |
+
if save:
|
578 |
+
logs.warning("Saving tokenized dataset to disk")
|
579 |
+
# save tokenized text
|
580 |
+
write_df(self.tokenized_df, self.tokenized_df_fid)
|
581 |
|
582 |
def load_or_prepare_text_dset(self, save=True):
|
583 |
if (self.use_cache and exists(self.text_dset_fid)):
|
|
|
587 |
logs.info(self.text_dset)
|
588 |
# ...Or load it from the server and store it anew
|
589 |
else:
|
590 |
+
if not self.live:
|
591 |
+
self.prepare_text_dset()
|
592 |
+
if save:
|
593 |
+
# save extracted text instances
|
594 |
+
logs.warning("Saving dataset to disk")
|
595 |
+
self.text_dset.save_to_disk(self.text_dset_fid)
|
596 |
|
597 |
def prepare_text_dset(self):
|
598 |
+
if not self.live:
|
599 |
+
self.get_base_dataset()
|
600 |
+
# extract all text instances
|
601 |
+
self.text_dset = self.dset.map(
|
602 |
+
lambda examples: extract_field(
|
603 |
+
examples, self.text_field, OUR_TEXT_FIELD
|
604 |
+
),
|
605 |
+
batched=True,
|
606 |
+
remove_columns=list(self.dset.features),
|
607 |
+
)
|
608 |
|
609 |
def do_tokenization(self):
|
610 |
"""
|
|
|
663 |
if save:
|
664 |
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
665 |
else:
|
666 |
+
if not self.live:
|
667 |
+
self.prepare_labels()
|
668 |
+
if save:
|
669 |
+
# save extracted label instances
|
670 |
+
self.label_dset.save_to_disk(self.label_dset_fid)
|
671 |
+
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
672 |
|
673 |
def prepare_labels(self):
|
674 |
+
if not self.live:
|
675 |
+
self.get_base_dataset()
|
676 |
+
self.label_dset = self.dset.map(
|
677 |
+
lambda examples: extract_field(
|
678 |
+
examples, self.label_field, OUR_LABEL_FIELD
|
679 |
+
),
|
680 |
+
batched=True,
|
681 |
+
remove_columns=list(self.dset.features),
|
682 |
+
)
|
683 |
+
self.label_df = self.label_dset.to_pandas()
|
684 |
+
self.fig_labels = make_fig_labels(
|
685 |
+
self.label_df, self.label_names, OUR_LABEL_FIELD
|
686 |
+
)
|
687 |
|
688 |
def load_or_prepare_npmi(self):
|
689 |
self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
|
|
|
727 |
by calling the nPMI class with the user's selections."""
|
728 |
|
729 |
def __init__(self, dataset_stats, use_cache=False):
|
730 |
+
self.live = dataset_stats.live
|
731 |
self.dstats = dataset_stats
|
732 |
self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
|
733 |
if not isdir(self.pmi_cache_path):
|
|
|
804 |
joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
|
805 |
# When maybe some things have been computed for the selected subgroups.
|
806 |
else:
|
807 |
+
if not self.live:
|
808 |
+
logs.info("Preparing new joint npmi")
|
809 |
+
joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
|
810 |
+
subgroup_pair, subgroup_files
|
811 |
+
)
|
812 |
+
# Cache new results
|
813 |
+
logs.info("Writing out.")
|
814 |
+
for subgroup in subgroup_pair:
|
815 |
+
write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
|
816 |
+
with open(joint_npmi_fid, "w+") as f:
|
817 |
+
joint_npmi_df.to_csv(f)
|
818 |
logs.info("The joint npmi df is")
|
819 |
logs.info(joint_npmi_df)
|
820 |
return joint_npmi_df
|
data_measurements/streamlit_utils.py
CHANGED
@@ -178,7 +178,11 @@ def expander_text_lengths(dstats, column_id):
|
|
178 |
value=0,
|
179 |
step=1,
|
180 |
)
|
181 |
-
|
|
|
|
|
|
|
|
|
182 |
|
183 |
|
184 |
### Third, use a sentence embedding model
|
@@ -273,7 +277,7 @@ def expander_text_duplicates(dstats, column_id):
|
|
273 |
st.write(
|
274 |
"### Here is the list of all the duplicated items and their counts in your dataset:"
|
275 |
)
|
276 |
-
if dstats.dup_counts_df is None:
|
277 |
st.write("There are no duplicates in this dataset! 🥳")
|
278 |
else:
|
279 |
gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
|
|
|
178 |
value=0,
|
179 |
step=1,
|
180 |
)
|
181 |
+
|
182 |
+
# This is quite a large file and is breaking our ability to navigate the app development.
|
183 |
+
# Just passing if it's not already there for launch v0
|
184 |
+
if dstats.length_df is not None:
|
185 |
+
st.dataframe(dstats.length_df[dstats.length_df["length"] == start_id_show_lengths].set_index("length"))
|
186 |
|
187 |
|
188 |
### Third, use a sentence embedding model
|
|
|
277 |
st.write(
|
278 |
"### Here is the list of all the duplicated items and their counts in your dataset:"
|
279 |
)
|
280 |
+
if dstats.dup_counts_df is None or dstats.dup_counts_df.empty:
|
281 |
st.write("There are no duplicates in this dataset! 🥳")
|
282 |
else:
|
283 |
gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
|