# Data Measurements Tool: nPMI association and bias measurements.
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import warnings
from collections import defaultdict
from os.path import exists
from os.path import join as pjoin

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import utils
import utils.dataset_utils as ds_utils
from utils.dataset_utils import CNT, TOKENIZED_FIELD
# Might be nice to print to log instead? Happens when we drop closed class.
warnings.filterwarnings(action="ignore", category=UserWarning)
# When we divide by 0 in log
np.seterr(divide="ignore")
# Treating inf values as NaN as well.
# NOTE(review): "use_inf_as_na" is deprecated in pandas >= 2.1 and removed
# later — confirm the pinned pandas version before upgrading.
pd.set_option("use_inf_as_na", True)

# Module-level logger, configured by the project's logging helper.
logs = utils.prepare_logging(__file__)

# Number of batches for the sentence-binarization pass.
# TODO: Should be possible for a user to specify this.
NUM_BATCHES = 500
# For the associations of an identity term
SING = "associations"
# For the difference between the associations of identity terms
DIFF = "biases"
# Used in the figures we show in DMT
DMT = "combined"
def pair_terms(id_terms):
    """Creates alphabetically ordered paired terms based on the given terms.

    Each unordered pair of distinct positions (i, j), i < j, contributes one
    tuple, sorted alphabetically within the pair; pairs are emitted in the
    order the first element appears in `id_terms`.
    """
    return [
        tuple(sorted((id_terms[first], id_terms[second])))
        for first in range(len(id_terms))
        for second in range(first + 1, len(id_terms))
    ]
class DMTHelper:
    """Helper class for the Data Measurements Tool.

    This allows us to keep all variables and functions related to labels
    in one file. Orchestrates computing (or loading from cache) the
    per-identity-term association scores and the pairwise "bias"
    differences between them.
    """

    def __init__(self, dstats, identity_terms, load_only=False, use_cache=False,
                 save=True):
        """
        Args:
            dstats: The data measurements tool settings object
                (dataset, config, cache dirs, vocab counts, etc.).
            identity_terms: Full list of identity terms to measure.
            load_only: Whether results may only be loaded from cache
                (when live, no recomputation).
            use_cache: Whether to first try using cache before calculating.
            save: Whether to save results to the cache.
        """
        # The data measurements tool settings (dataset, config, etc.)
        self.dstats = dstats
        # Whether we can use caching (when live, no).
        self.load_only = load_only
        # Whether to first try using cache before calculating
        self.use_cache = use_cache
        # Whether to save results
        self.save = save
        # Tokenized dataset
        tokenized_df = dstats.tokenized_df
        self.tokenized_sentence_df = tokenized_df[TOKENIZED_FIELD]
        # Dataframe of shape #vocab x 1 (count)
        self.vocab_counts_df = dstats.vocab_counts_df
        # Cutoff for the number of times something must occur to be included
        self.min_count = dstats.min_vocab_count
        self.cache_path = pjoin(dstats.dataset_cache_dir, SING)
        self.avail_terms_json_fid = pjoin(self.cache_path,
                                          "identity_terms.json")
        # TODO: Users ideally can type in whatever words they want.
        # This is the full list of terms.
        self.identity_terms = identity_terms
        logs.info("Using term list:")
        logs.info(self.identity_terms)
        # identity_terms terms that are available more than MIN_VOCAB_COUNT
        self.avail_identity_terms = []
        # TODO: Let users specify
        self.open_class_only = True
        # Single-word associations
        self.assoc_results_dict = defaultdict(dict)
        # Paired term association bias
        self.bias_results_dict = defaultdict(dict)
        # Dataframes used in displays.
        self.bias_dfs_dict = defaultdict(dict)
        # Results of the single word associations and their paired bias values.
        # Formatted as:
        # {(s1,s2)): {pd.DataFrame({s1-s2:diffs, s1:assoc, s2:assoc})}}
        self.results_dict = defaultdict(lambda: defaultdict(dict))
        # Filenames for cache, based on the results
        self.filenames_dict = defaultdict(dict)

    def run_DMT_processing(self):
        """Entry point: finds the usable identity terms, then computes or
        loads the association & bias results for them."""
        # The identity terms that can be used
        self.load_or_prepare_avail_identity_terms()
        # Association measurements & pair-wise differences for identity terms.
        self.load_or_prepare_dmt_results()

    def load_or_prepare_avail_identity_terms(self):
        """
        Figures out what identity terms the user can select, based on whether
        they occur more than self.min_vocab_count times.

        Provides identity terms -- uniquely and in pairs -- occurring at least
        self.min_vocab_count times.
        """
        # If we're trying to use the cache of available terms
        if self.use_cache:
            self.avail_identity_terms = self._load_identity_cache()
            if self.avail_identity_terms:
                logs.info(
                    "Loaded identity terms occurring >%s times"
                    % self.min_count)
        # Figure out the identity terms if we're not just loading from cache
        if not self.load_only:
            if not self.avail_identity_terms:
                self.avail_identity_terms = self._prepare_identity_terms()
            # Finish
            if self.save:
                self._write_term_cache()

    def _load_identity_cache(self):
        """Reads the cached list of available identity terms, if present;
        returns [] when there is no cache file."""
        if exists(self.avail_terms_json_fid):
            avail_identity_terms = ds_utils.read_json(self.avail_terms_json_fid)
            return avail_identity_terms
        return []

    def _prepare_identity_terms(self):
        """Returns the given identity terms that appear in the vocabulary
        with a count of at least self.min_count."""
        # Identity terms that actually occur in the vocabulary.
        word_list_tmp = [term for term in self.identity_terms
                         if term in self.vocab_counts_df.index]
        # Of those, keep the ones occurring at least min_count times.
        avail_identity_terms = [
            word for word in word_list_tmp
            if self.vocab_counts_df.loc[word, CNT] >= self.min_count]
        logs.debug("Identity terms that occur > %s times are:" % self.min_count)
        logs.debug(avail_identity_terms)
        return avail_identity_terms

    def load_or_prepare_dmt_results(self):
        """Loads cached association/bias results when allowed, otherwise
        computes them; optionally writes them back out to the cache."""
        # Initialize with no results (reset).
        self.results_dict = {}
        # Filenames for caching and saving
        self._make_fids()
        # If we're trying to use the cache of already computed results
        if self.use_cache:
            # Loads the association results and dataframes used in the display.
            logs.debug("Trying to load...")
            self.results_dict = self._load_dmt_cache()
        # Compute results if we can
        if not self.load_only:
            # If there isn't a solution using cache
            if not self.results_dict:
                # Does the actual computations
                self.prepare_results()
            # Finish
            if self.save:
                # Writes the paired & singleton dataframe out.
                self._write_dmt_cache()

    def _load_dmt_cache(self):
        """
        Loads dataframes with paired differences and individual item scores,
        keyed by sorted identity-term pair.
        """
        results_dict = defaultdict(lambda: defaultdict(dict))
        pairs = pair_terms(self.avail_identity_terms)
        for pair in pairs:
            combined_fid = self.filenames_dict[DMT][pair]
            if exists(combined_fid):
                results_dict[pair] = ds_utils.read_df(combined_fid)
        return results_dict

    def prepare_results(self):
        """Runs the nPMI computation and stores the association and paired
        bias results on this helper."""
        assoc_obj = nPMI(self.dstats.vocab_counts_df,
                         self.tokenized_sentence_df,
                         self.avail_identity_terms)
        self.assoc_results_dict = assoc_obj.assoc_results_dict
        self.results_dict = assoc_obj.bias_results_dict

    def _prepare_dmt_dfs(self, measure="npmi"):
        """
        Create the main dataframe that is used in the DMT, which lists
        the npmi scores for each paired identity term and the difference
        between them. The difference between them is the "bias".
        """
        # Paired identity terms, associations and differences, in one dataframe.
        bias_dfs_dict = defaultdict(dict)
        logs.debug("bias results dict is")
        logs.debug(self.bias_results_dict)
        for pair in sorted(self.bias_results_dict):
            combined_df = pd.DataFrame()
            # Paired identity terms, values are the difference between them.
            combined_df[pair] = pd.DataFrame(self.bias_results_dict[pair])
            s1, s2 = pair
            # Single identity term 1, values
            combined_df[s1] = pd.DataFrame(self.assoc_results_dict[s1][measure])
            # Single identity term 2, values
            combined_df[s2] = pd.DataFrame(self.assoc_results_dict[s2][measure])
            # Full dataframe with scores per-term,
            # as well as the difference between.
            bias_dfs_dict[pair] = combined_df
        # {pair: {pd.DataFrame({(s1,s2)):diffs, s1:assocs, s2:assocs})}}
        logs.debug("combined df is")
        logs.debug(bias_dfs_dict)
        return bias_dfs_dict

    def _write_term_cache(self):
        """Writes the available identity terms out to the cache."""
        ds_utils.make_path(self.cache_path)
        if self.avail_identity_terms:
            ds_utils.write_json(self.avail_identity_terms,
                                self.avail_terms_json_fid)

    def _write_dmt_cache(self, measure="npmi"):
        """Writes each pair's combined bias dataframe to its cache file."""
        ds_utils.make_path(pjoin(self.cache_path, measure))
        for pair, bias_df in self.results_dict.items():
            logs.debug("Results for pair is:")
            logs.debug(bias_df)
            fid = self.filenames_dict[DMT][pair]
            logs.debug("Writing to %s" % fid)
            ds_utils.write_df(bias_df, fid)

    def _make_fids(self, measure="npmi"):
        """
        Utility function to create filename/path strings for the different
        result caches. This includes single identity term results as well
        as the difference between them. Also includes the datastructure used
        in the DMT, which is a dataframe that has:
        (term1, term2) difference, term1 (scores), term2 (scores)
        """
        self.filenames_dict = {SING: {}, DIFF: {}, DMT: {}}
        # When we have the available identity terms,
        # we can make cache filenames for them.
        for id_term in self.avail_identity_terms:
            filename = SING + "-" + id_term + ".json"
            json_fid = pjoin(self.cache_path, measure, filename)
            self.filenames_dict[SING][id_term] = json_fid
        paired_terms = pair_terms(self.avail_identity_terms)
        for id_term_tuple in paired_terms:
            # The paired association results (bias) are stored with these files.
            id_term_str = '-'.join(id_term_tuple)
            filename = DIFF + "-" + id_term_str + ".json"
            json_fid = pjoin(self.cache_path, measure, filename)
            self.filenames_dict[DIFF][id_term_tuple] = json_fid
            # The display dataframes in the DMT are stored with these files.
            filename = DMT + "-" + id_term_str + ".json"
            json_fid = pjoin(self.cache_path, measure, filename)
            self.filenames_dict[DMT][id_term_tuple] = json_fid

    def get_display(self, s1, s2):
        """Returns the display dataframe for the (s1, s2) identity pair.

        Fix: column labels now follow the sorted-pair order, matching the
        order of the underlying data columns (difference, pair[0], pair[1]).
        Previously the labels used the caller's argument order, which
        mislabeled the two term columns whenever s1 > s2 alphabetically.
        """
        pair = tuple(sorted([s1, s2]))
        display_df = self.results_dict[pair]
        logs.debug(self.results_dict)
        display_df.columns = ["bias", pair[0], pair[1]]
        return display_df

    def get_filenames(self):
        """Returns the cache filenames for the available terms and results."""
        filenames = {"available terms": self.avail_terms_json_fid,
                     "results": self.filenames_dict}
        return filenames
class nPMI:
    """
    Uses the vocabulary dataframe and tokenized sentences to calculate
    co-occurrence statistics, PMI, and nPMI.
    """

    def __init__(self, vocab_counts_df, tokenized_sentence_df, given_id_terms):
        """
        Args:
            vocab_counts_df: Dataframe indexed by vocabulary word, read here
                via its "count" and "proportion" columns.
            tokenized_sentence_df: Series of tokenized sentences (lists of
                vocabulary words), one entry per sentence.
            given_id_terms: The identity terms to measure associations for.
        """
        logs.debug("Initiating assoc class.")
        self.vocab_counts_df = vocab_counts_df
        # TODO: Change this logic so just the vocabulary is given.
        self.vocabulary = list(vocab_counts_df.index)
        self.vocab_counts = pd.DataFrame([0] * len(self.vocabulary))
        logs.debug("vocabulary is")
        logs.debug(self.vocab_counts_df)
        self.tokenized_sentence_df = tokenized_sentence_df
        logs.debug("tokenized sentences are")
        logs.debug(self.tokenized_sentence_df)
        self.given_id_terms = given_id_terms
        logs.info("identity terms are")
        logs.info(self.given_id_terms)
        # Terms we calculate the difference between
        self.paired_terms = pair_terms(given_id_terms)
        # Matrix of # sentences x vocabulary size
        self.word_cnts_per_sentence = self.count_words_per_sentence()
        logs.info("Calculating results...")
        # Formatted as {subgroup:{"count":{...},"npmi":{...}}}
        self.assoc_results_dict = self.calc_measures()
        # Dictionary keyed by pair tuples. Each value is a dataframe with
        # vocab terms as the index, and columns of paired difference and
        # individual scores for the two identity terms.
        self.bias_results_dict = self.calc_bias(self.assoc_results_dict)

    def count_words_per_sentence(self):
        """Counts the number of each vocabulary item per-sentence in batches.

        Returns a list of per-batch binarized matrices, each of shape
        (# sentences in batch) x (vocabulary size).
        """
        logs.info("Creating co-occurrence matrix for nPMI calculations.")
        word_cnts_per_sentence = []
        logs.info(self.tokenized_sentence_df)
        batches = np.linspace(0, self.tokenized_sentence_df.shape[0],
                              NUM_BATCHES).astype(int)
        # Creates matrix of size # batches x # sentences
        for batch_num in range(len(batches) - 1):
            # Makes matrix shape: batch size (# sentences) x # words,
            # with the occurrence of each word per sentence.
            # vocab_counts_df.index is the vocabulary.
            mlb = MultiLabelBinarizer(classes=self.vocabulary)
            if batch_num % 100 == 0:
                logs.debug(
                    "%s of %s sentence binarize batches." % (
                        str(batch_num), str(len(batches)))
                )
            # Per-sentence word counts
            sentence_batch = self.tokenized_sentence_df[
                batches[batch_num]:batches[batch_num + 1]]
            mlb_series = mlb.fit_transform(sentence_batch)
            word_cnts_per_sentence.append(mlb_series)
        return word_cnts_per_sentence

    def calc_measures(self):
        """Computes co-occurrence, PMI, and nPMI for each identity term.

        Returns:
            {subgroup: {"count": df, "pmi": df, "npmi": df}}
        """
        id_results = {}
        for subgroup in self.given_id_terms:
            logs.info("Calculating for %s " % subgroup)
            # Index of the identity term in the vocabulary
            subgroup_idx = self.vocabulary.index(subgroup)
            # Fix: was a stray print(); route through the module logger.
            logs.debug("idx is %s" % subgroup_idx)
            logs.debug("Calculating co-occurrences...")
            vocab_cooc_df = self.calc_cooccurrences(subgroup, subgroup_idx)
            logs.debug("Calculating PMI...")
            pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
            logs.debug("PMI dataframe is:")
            logs.debug(pmi_df)
            logs.debug("Calculating nPMI...")
            npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
            logs.debug("npmi df is")
            logs.debug(npmi_df)
            # Create a data structure for the identity term associations
            id_results[subgroup] = {"count": vocab_cooc_df,
                                    "pmi": pmi_df,
                                    "npmi": npmi_df}
            # Fix: was a stray print(); route through the module logger.
            logs.debug("results_dict is:")
            logs.debug(id_results)
        return id_results

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        """Sums, over all batches, the co-occurrence counts between the
        subgroup term and every vocabulary word.

        Returns a dataframe indexed by the vocabulary, with one integer
        "count" column.
        """
        initialize = True
        coo_df = None
        # Big computation here! Should only happen once.
        logs.debug(
            "Approaching big computation! Here, we binarize all words in the "
            "sentences, making a sparse matrix of sentences."
        )
        for batch_id in range(len(self.word_cnts_per_sentence)):
            # Every 100 batches, print out the progress.
            if not batch_id % 100:
                logs.debug(
                    "%s of %s co-occurrence count batches"
                    % (str(batch_id), str(len(self.word_cnts_per_sentence)))
                )
            # List of all the sentences (list of vocab) in that batch
            batch_sentence_row = self.word_cnts_per_sentence[batch_id]
            # Dataframe of # sentences in batch x vocabulary size
            sent_batch_df = pd.DataFrame(batch_sentence_row)
            # Subgroup counts per-sentence for the given batch
            subgroup_df = sent_batch_df[subgroup_idx]
            # NOTE(review): subgroup_df is a Series here, so this attribute
            # assignment does not create real columns — confirm intent.
            subgroup_df.columns = [subgroup]
            # Remove the sentences where the count of the subgroup is 0.
            # This way we have less computation & resources needs.
            subgroup_df = subgroup_df[subgroup_df > 0]
            mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
            # Create cooccurrence matrix for the given subgroup and all words.
            batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))
            # Creates a batch-sized dataframe of co-occurrence counts.
            # Note these could just be summed rather than be batch size.
            if initialize:
                coo_df = batch_coo_df
            else:
                coo_df = coo_df.add(batch_coo_df, fill_value=0)
            initialize = False
        logs.debug("Made co-occurrence matrix")
        logs.debug(coo_df)
        count_df = coo_df.set_index(self.vocab_counts_df.index)
        count_df.columns = ["count"]
        count_df["count"] = count_df["count"].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """
        PMI(x;y) = h(y) - h(y|x)
                 = h(subgroup) - h(subgroup|word)
                 = log (p(subgroup|word) / p(subgroup))
        nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        """
        # Fix: were stray print() debugging calls; route through the logger.
        logs.debug("vocab cooc df")
        logs.debug(vocab_cooc_df)
        logs.debug("vocab counts")
        logs.debug(self.vocab_counts_df["count"])
        # Calculation of p(subgroup)
        subgroup_prob = self.vocab_counts_df.loc[subgroup]["proportion"]
        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
        # Because the indices match (the vocab words),
        # this division doesn't need to specify the index (I think?!)
        vocab_cooc_df.columns = ["cooc"]
        p_subgroup_g_word = (
            vocab_cooc_df["cooc"] / self.vocab_counts_df["count"])
        logs.info("p_subgroup_g_word is")
        logs.info(p_subgroup_g_word)
        pmi_df = pd.DataFrame()
        pmi_df[subgroup] = np.log(p_subgroup_g_word / subgroup_prob).dropna()
        # Note: A potentially faster solution for adding count, npmi,
        # can be based on this zip idea:
        # df_test['size_kb'], df_test['size_mb'], df_test['size_gb'] =
        # zip(*df_test['size'].apply(sizes))
        return pmi_df

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """
        nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
                                                  = -log(p(word|subgroup)p(word))
        """
        p_word_g_subgroup = vocab_cooc_df["cooc"] / sum(vocab_cooc_df["cooc"])
        logs.debug("p_word_g_subgroup")
        logs.debug(p_word_g_subgroup)
        p_word = pmi_df.apply(
            lambda x: self.vocab_counts_df.loc[x.name]["proportion"], axis=1
        )
        logs.debug("p word is")
        logs.debug(p_word)
        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = pd.DataFrame()
        npmi_df[subgroup] = pmi_df[subgroup] / normalize_pmi
        return npmi_df.dropna()

    def calc_bias(self, measurements_dict, measure="npmi"):
        """Uses the subgroup dictionaries to compute the differences across
        pairs.

        Uses dictionaries rather than dataframes due to the fact that dicts
        seem to be preferred amongst evaluate users so far.

        :return: Dict of (id_term1, id_term2):{term1:diff, term2:diff ...}
        """
        paired_results_dict = {}
        for pair in self.paired_terms:
            paired_results = pd.DataFrame()
            s1 = pair[0]
            s2 = pair[1]
            s1_results = measurements_dict[s1][measure]
            s2_results = measurements_dict[s2][measure]
            # !!! This is the final result of all the work !!!
            word_diffs = s1_results[s1] - s2_results[s2]
            paired_results[("%s - %s" % (s1, s2))] = word_diffs
            paired_results[s1] = s1_results
            paired_results[s2] = s2_results
            paired_results_dict[pair] = paired_results.dropna()
        logs.debug("Paired bias results from the main nPMI class are ")
        logs.debug(paired_results_dict)
        return paired_results_dict

    def _write_debug_msg(self, batch_id, subgroup_df=None,
                         subgroup_sentences=None, msg_type="batching"):
        """Emits progress/debug messages for the co-occurrence computation."""
        if msg_type == "batching":
            if not batch_id % 100:
                logs.debug(
                    "%s of %s co-occurrence count batches"
                    % (str(batch_id), str(len(self.word_cnts_per_sentence)))
                )
        elif msg_type == "transpose":
            if not batch_id % 100:
                logs.debug("Removing 0 counts, subgroup_df is")
                logs.debug(subgroup_df)
                logs.debug("subgroup_sentences is")
                logs.debug(subgroup_sentences)
                logs.debug(
                    "Now we do the transpose approach for co-occurrences")