# Source: Hugging Face Space HuggingFaceM4/IDEFICS_Data_Measurement_Tool
# (revision 46df0b6; file size 9,221 bytes; the Space status at capture time
# was "Runtime error").

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TODO: Change print statements to logging?
# from evaluate import logging as logs
import warnings

import datasets
import evaluate
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

_CITATION = """\
Osman Aka, Ken Burke, Alex Bauerle, Christina Greer, and Margaret Mitchell. \
2021. Measuring Model Biases in the Absence of Ground Truth. \
In Proceedings of the 2021 AAAI/ACM Conference on AI, Ethics, and Society \
(AIES '21). Association for Computing Machinery, New York, NY, USA, 327–335. \
https://doi.org/10.1145/3461702.3462557
"""

_DESCRIPTION = """\
Normalized Pointwise Mutual Information (nPMI) is an entropy-based measurement
of association, used here to measure the association between words.
"""

_KWARGS_DESCRIPTION = """\
Args:
    references (list of lists): List of tokenized sentences.
    vocab_counts (dict or dataframe): Vocab terms and their counts
    subgroup (str): Vocab term whose association with every other term \
is measured.
Returns:
    A dict with: "bias" (max nPMI plus absolute min nPMI); \
"co-occurrences" (per-term co-occurrence counts with the subgroup); \
"pmi" (dataframe of PMI scores); "npmi" (dataframe of nPMI scores).
"""

# TODO: Is this necessary?
warnings.filterwarnings(action="ignore", category=UserWarning)
# When we divide by 0 in log
np.seterr(divide="ignore")

# Treating inf values as NaN as well.
# "use_inf_as_na" is deprecated in recent pandas releases and is not a
# registered option in the newest ones, where setting it raises OptionError.
# Guard the call so that importing this module does not fail there.
try:
    pd.set_option("use_inf_as_na", True)
except pd.errors.OptionError:
    # RuntimeWarning (not UserWarning) so the filter above does not hide it.
    warnings.warn(
        "pandas option 'use_inf_as_na' is unavailable; infinite values will "
        "not be treated as NaN globally.",
        RuntimeWarning,
    )

# This can be changed to whatever a person likes;
# it is the number of evenly spaced batch boundaries used when iterating
# through the sentences (so there are at most _NUM_BATCHES - 1 batches).
_NUM_BATCHES = 500
# Column names used in the vocabulary-count dataframe.
PROP = "proportion"
CNT = "count"

class nPMI(evaluate.Measurement):
    """Measurement of the association between a subgroup term and the vocabulary.

    For one ``subgroup`` term, computes per-term co-occurrence counts over the
    tokenized sentences, then PMI and normalized PMI scores derived from those
    counts and from the vocabulary counts.
    """

    def _info(self):
        """Return the module metadata (description, citation, input features)."""
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Sequence(
                        datasets.Value("string", id="sequence"),
                        id="references"),
                }
            )
            # TODO: Create docs for this.
            # reference_urls=["https://huggingface.co/docs/..."],
        )

    def _compute(self, references, vocab_counts, subgroup):
        """Compute co-occurrence, PMI and nPMI results for ``subgroup``.

        :param references: Sequence of tokenized sentences (must support
            ``len`` and slicing).
        :param vocab_counts: Either a dict mapping term -> count, or a
            dataframe indexed by term with a ``count`` column.
        :param subgroup: Vocabulary term to measure associations against.
            Must be present in the vocabulary index (otherwise the index
            lookup raises ``KeyError``).
        :return: Dict with keys "bias", "co-occurrences", "pmi" and "npmi";
            ``None`` (after printing a message) if ``vocab_counts`` is
            neither a dict nor a dataframe.
        """
        if isinstance(vocab_counts, dict):
            vocab_counts_df = pd.DataFrame.from_dict(
                vocab_counts, orient="index", columns=[CNT]
            )
        elif isinstance(vocab_counts, pd.DataFrame):
            # Work on a copy: the proportion column added below must not
            # leak into the caller's dataframe.
            vocab_counts_df = vocab_counts.copy()
        else:
            print("Can't support the data structure for the vocab counts. =(")
            return None
        vocab_counts_df[PROP] = (
            vocab_counts_df[CNT] / vocab_counts_df[CNT].sum()
        )
        # These are used throughout the rest of the functions
        self.references = references
        self.vocab_counts_df = vocab_counts_df
        # self.mlb_list holds one binarized (sentences x vocab) array per batch
        self.mlb_list = []
        # Index of the subgroup word in the sparse vector
        subgroup_idx = vocab_counts_df.index.get_loc(subgroup)
        print("Calculating co-occurrences...")
        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
        print("Calculating PMI...")
        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
        print("Calculating nPMI...")
        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
        # Largest score plus the magnitude of the smallest score.
        npmi_bias = npmi_df.max(axis=0) + abs(npmi_df.min(axis=0))
        return {"bias": npmi_bias, "co-occurrences": vocab_cooc_df,
                "pmi": pmi_df, "npmi": npmi_df}

    def _binarize_words_in_sentence(self):
        """Binarize the sentences batch by batch and append to ``self.mlb_list``.

        Each appended item has shape (# sentences in batch x # vocab terms)
        and marks which vocabulary terms occur in each sentence.
        """
        print("Creating co-occurrence matrix for PMI calculations.")
        # Evenly spaced batch boundaries. np.unique collapses the repeated
        # boundaries that appear when there are fewer sentences than
        # boundaries, so no empty batch is ever binarized.
        boundaries = np.unique(
            np.linspace(0, len(self.references), _NUM_BATCHES).astype(int)
        )
        num_batches = len(boundaries) - 1
        # The class list is the same for every batch, so build it once.
        mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
        for i, (start, end) in enumerate(zip(boundaries[:-1], boundaries[1:])):
            print(
                "%s of %s sentence binarize batches." % (
                    str(i), str(num_batches))
            )
            # Array of shape: batch size x num_words
            self.mlb_list.append(
                mlb.fit_transform(self.references[start:end])
            )

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        """Count, for every vocab term, the sentences it shares with the subgroup.

        :param subgroup: The subgroup term (kept for interface compatibility;
            the computation only needs its position).
        :param subgroup_idx: Integer position of the subgroup term in the
            vocabulary index.
        :return: Single-column dataframe with one row per vocabulary term, in
            vocabulary order, holding the co-occurrence counts.
        """
        # Big computation here!  Should only happen once.
        print(
            "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
        )
        if not self.mlb_list:
            self._binarize_words_in_sentence()
        num_batches = len(self.mlb_list)
        # Starting from zeros keeps the result well-formed when there are
        # no sentences (and therefore no batches) at all.
        coo_counts = np.zeros(len(self.vocab_counts_df.index), dtype=np.int64)
        for batch_id, batch in enumerate(self.mlb_list):
            print(
                "%s of %s co-occurrence count batches"
                % (str(batch_id), str(num_batches))
            )
            # Binarized batch: # sentences in batch x vocabulary size
            batch = np.asarray(batch)
            # Only sentences containing the subgroup contribute. Because the
            # entries are 0/1, summing those rows column-wise gives, per
            # term, the number of sentences shared with the subgroup.
            has_subgroup = batch[:, subgroup_idx] > 0
            coo_counts += batch[has_subgroup].sum(axis=0)
        print("Returning co-occurrence matrix")
        return pd.DataFrame(coo_counts)

    def set_idx_cols(self, df_coo, subgroup):
        """Label the co-occurrence counts with vocabulary terms.

        :param df_coo: Co-occurrence counts for subgroup, length is num_words
        :param subgroup: The subgroup term; used to name the count column.
        :return: Dataframe indexed by vocabulary term with a single integer
            column named "<subgroup>-count".
        """
        count_col = subgroup + "-count"
        count_df = df_coo.set_index(self.vocab_counts_df.index)
        count_df.columns = [count_col]
        count_df[count_col] = count_df[count_col].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """Compute PMI between the subgroup and every vocabulary term.

        # PMI(x;y) = h(y) - h(y|x)
        #          = h(subgroup) - h(subgroup|word)
        #          = log (p(subgroup|word) / p(subgroup))
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))

        :param vocab_cooc_df: Output of ``set_idx_cols``.
        :param subgroup: The subgroup term.
        :return: Dataframe with a "<subgroup>-pmi" column; rows whose score
            is NaN or infinite (e.g. terms that never co-occur with the
            subgroup) are dropped.
        """
        counts = self.vocab_counts_df[CNT]
        # Calculation of p(subgroup)
        subgroup_prob = counts.loc[subgroup] / counts.sum()
        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
        # Both series are indexed by vocab term, so the division aligns
        # on the term labels.
        p_subgroup_g_word = vocab_cooc_df[subgroup + "-count"] / counts
        pmi_df = np.log(p_subgroup_g_word / subgroup_prob).to_frame(
            subgroup + "-pmi"
        )
        # Drop infinite scores explicitly instead of relying on the
        # deprecated global pandas option "use_inf_as_na".
        return pmi_df.replace([np.inf, -np.inf], np.nan).dropna()

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """Normalize the PMI scores.

        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))

        NOTE(review): the joint probability is approximated below as
        p(word|subgroup) * p(word). By the identity above it would be
        p(word|subgroup) * p(subgroup) (or p(subgroup|word) * p(word)).
        The original computation is kept so existing scores do not change;
        confirm the intended normalizer before altering it.

        :param pmi_df: Output of ``calc_PMI``.
        :param vocab_cooc_df: Output of ``set_idx_cols``.
        :param subgroup: The subgroup term.
        :return: Dataframe with a "<subgroup>-npmi" column; rows whose score
            is NaN or infinite are dropped.
        """
        cooc_counts = vocab_cooc_df[subgroup + "-count"]
        p_word_g_subgroup = cooc_counts / cooc_counts.sum()
        # Vectorized lookup of p(word) for exactly the terms that have a PMI.
        p_word = self.vocab_counts_df.loc[pmi_df.index, PROP]
        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = (pmi_df[subgroup + "-pmi"] / normalize_pmi).to_frame(
            subgroup + "-npmi"
        )
        # Drop infinite scores explicitly instead of relying on the
        # deprecated global pandas option "use_inf_as_na".
        return npmi_df.replace([np.inf, -np.inf], np.nan).dropna()