Spaces:
Running
Running
# %% | |
# from http.client import TEMPORARY_REDIRECT | |
import gradio as gr | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import random | |
from matplotlib.ticker import MaxNLocator | |
from transformers import pipeline | |
from winogender_sentences import get_sentences | |
# Fill-mask checkpoints that are loaded up front and offered in the UI.
MODEL_NAMES = [
    "roberta-large",
    "roberta-base",
    "bert-large-uncased",
    "bert-base-uncased",
]
# Radio choice meaning "use the model named in the free-text box instead".
OWN_MODEL_NAME = 'add-a-model'
# Radio choice meaning "use the sentences typed in the free-text box instead".
PICK_YOUR_OWN_LABEL = 'pick-your-own'

DECIMAL_PLACES = 1  # rounding applied to reported percentages
EPS = 1e-5  # to avoid /0 errors
NUM_PTS_TO_AVERAGE = 4  # how many dates are taken from each end of DATES

# Example date constants
DATE_SPLIT_KEY = "DATE"
START_YEAR = 1901
STOP_YEAR = 2016
NUM_PTS = 30
# NUM_PTS evenly spaced years from START_YEAR to STOP_YEAR (inclusive),
# truncated to whole years and stored as strings.
DATES = [
    f'{d}'
    for d in np.linspace(START_YEAR, STOP_YEAR, NUM_PTS).astype(int).tolist()
]

# [male, female] word pairs counted as gendered predictions.
GENDERED_LIST = [
    ['he', 'she'],
    ['him', 'her'],
    ['his', 'hers'],
    ["himself", "herself"],
    ['male', 'female'],
    # ['man', 'woman'] Explicitly added in winogender extended sentences
    ['men', 'women'],
    ["husband", "wife"],
    ['father', 'mother'],
    ['boyfriend', 'girlfriend'],
    ['brother', 'sister'],
    ["actor", "actress"],
]
# %% | |
# Fire up the models
# One fill-mask pipeline per entry of MODEL_NAMES, built at import time and
# keyed by the model name.
models = {
    bert_like: pipeline("fill-mask", model=bert_like)
    for bert_like in MODEL_NAMES
}
# %% | |
# Get the winogender sentences
winogender_sentences = get_sentences()
# The text before the first "_" of each sentence id is used as the group
# name; collect the distinct names in sorted order for the UI radio.
occs = sorted(
    {sentence_id.split('_')[0] for sentence_id in winogender_sentences}
)
# %% | |
def get_gendered_token_ids():
    """Split GENDERED_LIST into parallel male and female word lists.

    Despite the name, the returned values are the words themselves, not
    tokenizer vocabulary ids.

    Returns:
        A ``(male_words, female_words)`` tuple of lists. Position ``i`` in
        each list comes from the same GENDERED_LIST pair.
    """
    male_gendered_tokens = [pair[0] for pair in GENDERED_LIST]
    female_gendered_tokens = [pair[1] for pair in GENDERED_LIST]
    return male_gendered_tokens, female_gendered_tokens
def get_winogender_texts(occ):
    """Return the winogender sentences belonging to ``occ``.

    A sentence belongs to ``occ`` when the part of its id before the first
    "_" equals ``occ``.

    Args:
        occ: Group name to select (one of the prefixes found in the ids).

    Returns:
        List of matching sentence texts, in the iteration order of
        ``winogender_sentences``; empty when nothing matches.
    """
    return [
        text
        for sentence_id, text in winogender_sentences.items()
        if sentence_id.split('_')[0] == occ
    ]
def display_input_texts(occ, alt_text):
    """Pick the sentences to evaluate and build a numbered preview of them.

    Args:
        occ: Selected radio value. When it equals PICK_YOUR_OWN_LABEL the
            sentences are taken from ``alt_text``; otherwise they are looked
            up with ``get_winogender_texts(occ)``.
        alt_text: User-typed sentences, one per line. May be empty or None.

    Returns:
        A ``(preview, texts)`` tuple: ``preview`` is the sentences joined by
        newlines with a 1-based "N) " prefix, ``texts`` is the plain list.
    """
    if occ == PICK_YOUR_OWN_LABEL:
        # Skip blank lines (and tolerate an empty/None textbox): an empty
        # "sentence" has no mask to fill and would only break inference.
        texts = [
            line.strip()
            for line in (alt_text or '').splitlines()
            if line.strip()
        ]
    else:
        texts = get_winogender_texts(occ)
    display_texts = [f"{i+1}) {text}" for i, text in enumerate(texts)]
    return "\n".join(display_texts), texts
def get_avg_prob_from_pipeline_outputs(pipeline_preds, gendered_tokens, num_preds):
    """Return the probability mass on ``gendered_tokens`` as a percentage.

    Args:
        pipeline_preds: Sequence of candidate lists, one list per mask. Each
            candidate is a mapping with a "score" and a "token_str" entry.
        gendered_tokens: Lower-case words to count. A candidate counts when
            its ``token_str``, stripped and lower-cased, is one of them.
        num_preds: Number of masks to average over.

    Returns:
        Sum of matching scores over all masks, divided by ``num_preds``
        (plus EPS to avoid dividing by zero), times 100 and rounded to
        DECIMAL_PLACES.
    """
    # Build the lookup once instead of scanning the list for every candidate.
    gendered = set(gendered_tokens)
    pronoun_preds = [
        sum(
            pred["score"]
            for pred in top_preds
            if pred["token_str"].strip().lower() in gendered
        )
        for top_preds in pipeline_preds
    ]
    return round(sum(pronoun_preds) / (EPS + num_preds) * 100, DECIMAL_PLACES)
def is_top_pred_gendered(pipeline_preds, gendered_tokens):
    """Tell whether the first candidate for the first mask is a gendered word.

    Args:
        pipeline_preds: Sequence of candidate lists, one list per mask. Each
            candidate is a mapping with a "token_str" entry.
        gendered_tokens: Lower-case words considered gendered.

    Returns:
        True when the first candidate's ``token_str``, stripped and
        lower-cased, is in ``gendered_tokens``. False otherwise, including
        when there are no predictions at all.
    """
    # No candidates means there is no top prediction to be gendered.
    if not pipeline_preds or not pipeline_preds[0]:
        return False
    return pipeline_preds[0][0]['token_str'].strip().lower() in gendered_tokens
# %% | |
def get_figure(df, model_name, occ):
    """Draw the per-sentence uncertainty values as a bar chart.

    Args:
        df: DataFrame whose first column holds the x labels and whose second
            column holds the bar heights.
        model_name: Model name shown in the title.
        occ: Sentence group name shown in the title.

    Returns:
        The matplotlib figure containing the chart.
    """
    xs = df[df.columns[0]]
    ys = df[df.columns[1]]
    # Trying small fig due to rendering issues on HF, not on VS Code
    fig, ax = plt.subplots(figsize=(9, 3))
    ax.bar(xs, ys)
    ax.axis('tight')
    ax.set_xlabel("Sentence number")
    ax.set_ylabel("Uncertainty metric")
    ax.set_title(
        f"Uncertainty in {model_name} gender pronoun predictions in {occ} sentences.")
    # pyplot keeps every figure it creates registered until it is closed; in
    # a long-running app that is a leak. Closing only unregisters the figure,
    # the returned object can still be rendered by the caller.
    plt.close(fig)
    return fig
# %% | |
def predict_gender_pronouns(
    model_name,
    own_model_name,
    texts,
    occ,
):
    """Run inference on input_text for selected model type, returning uncertainty results.

    Each sentence is prefixed with "In <year>: " for the first and the last
    NUM_PTS_TO_AVERAGE entries of DATES. For every prompt the share of the
    gendered probability mass that falls on female words is computed (in
    percent). The uncertainty metric of a sentence is the absolute difference
    between the mean share over the late years and over the early years.

    Args:
        model_name: Selected radio value. Empty/None falls back to the first
            entry of MODEL_NAMES; any value not in MODEL_NAMES means "load
            ``own_model_name`` instead".
        own_model_name: Model name to load when ``model_name`` is not one of
            the preloaded models.
        texts: Sentences to evaluate; the literal text "MASK" in each one is
            replaced by the model's own mask token.
        occ: Sentence group name, used only in the plot title.

    Returns:
        A ``(dataframe, figure)`` tuple. The dataframe has the columns
        "Sentence number" and "Uncertainty metric"; a sentence number carries
        a trailing "*" when at least one top prediction for that sentence was
        not a gendered word. The figure is a bar chart of the same data.
    """
    # TODO: make these selectable by user
    indie_vars = ', '.join(DATES)
    num_ave = NUM_PTS_TO_AVERAGE
    num_preds = 1  # By design

    # For debugging
    print('input_texts', texts)

    # Resolve the pipeline and remember which model is really used, so the
    # plot title names that model rather than the raw radio value.
    if model_name is None or model_name == '':
        used_model_name = MODEL_NAMES[0]
        model = models[used_model_name]
    elif model_name not in MODEL_NAMES:
        used_model_name = own_model_name
        model = pipeline("fill-mask", model=own_model_name)
    else:
        used_model_name = model_name
        model = models[model_name]

    mask_token = model.tokenizer.mask_token

    # The values are joined with ", " above, so strip the blank that a plain
    # split(',') would leave in front of every value except the first.
    indie_vars_list = [value.strip() for value in indie_vars.split(',')]
    # Only the earliest and the latest values are probed.
    probe_vars = indie_vars_list[:num_ave] + indie_vars_list[-num_ave:]

    male_gendered_tokens, female_gendered_tokens = get_gendered_token_ids()
    all_gendered_tokens = female_gendered_tokens + male_gendered_tokens

    masked_texts = [text.replace('MASK', mask_token) for text in texts]

    all_uncertainty_f = {}
    for i, text in enumerate(masked_texts):
        female_pronoun_preds = []
        male_pronoun_preds = []
        top_pred_gendered = True  # Assume true unless told otherwise

        print(f"{i+1}) {text}")
        for indie_var in probe_vars:
            target_text = f"In {indie_var}: {text}"
            pipeline_preds = model(target_text)
            # Quick hack as realized return type based on how many MASKs in text.
            if not isinstance(pipeline_preds[0], list):
                pipeline_preds = [pipeline_preds]

            # If top-pred not gendered, record as such
            if not is_top_pred_gendered(pipeline_preds, all_gendered_tokens):
                top_pred_gendered = False

            female_pronoun_preds.append(get_avg_prob_from_pipeline_outputs(
                pipeline_preds,
                female_gendered_tokens,
                num_preds
            ))
            male_pronoun_preds.append(get_avg_prob_from_pipeline_outputs(
                pipeline_preds,
                male_gendered_tokens,
                num_preds
            ))

        # Normalizing by all gendered predictions
        total_gendered_probs = np.add(
            female_pronoun_preds, male_pronoun_preds)
        norm_female_pronoun_preds = np.around(
            np.divide(female_pronoun_preds, total_gendered_probs+EPS)*100,
            decimals=DECIMAL_PLACES
        )

        sent_idx = f"{i+1}" if top_pred_gendered else f"{i+1}*"
        # |mean(late years) - mean(early years)| of the female share.
        late_total = sum(norm_female_pronoun_preds[-num_ave:])
        early_total = sum(norm_female_pronoun_preds[:num_ave])
        all_uncertainty_f[sent_idx] = round(
            abs((late_total - early_total) / num_ave), DECIMAL_PLACES)

    uncertain_df = pd.DataFrame.from_dict(
        all_uncertainty_f, orient='index', columns=['Uncertainty metric'])
    uncertain_df = uncertain_df.reset_index().rename(
        columns={'index': 'Sentence number'})

    return (
        uncertain_df,
        get_figure(uncertain_df, used_model_name, occ),
    )
# %% | |
# Gradio UI: pick a model and a set of sentences, load the sentences, then
# compute and display the uncertainty table and plot.
# NOTE(review): the original indentation was lost; the nesting of statements
# inside the `with gr.Row():` blocks below is reconstructed -- confirm the
# intended layout.
demo = gr.Blocks()

with demo:
    # Holds the list of sentences between the "load" and "evaluate" clicks.
    input_texts = gr.Variable([])
    gr.Markdown("## Are you certain?")
    gr.Markdown(
        "LLMs are pretty good at reporting their uncertainty. We just need to ask the right way.")
    gr.Markdown(
        "Using our uncertainty metric informed by applying causal inference techniques in "
        "[Selection Collider Bias in Large Language Models](https://arxiv.org/abs/2208.10063), "
        "we are able to identify likely spurious correlations and exploit them in "
        "the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences "
        "below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
    gr.Markdown(
        "We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce "
        "eight syntactically similar sentences. However semantically, "
        "only two of the sentences are gender-specified while the rest remain gender-underspecified")
    gr.Markdown(
        "If a model can reliably tell us when it is uncertain about its predictions, one can replace only those uncertain predictions with "
        "information retrieval methods, or in the case of gender pronoun prediction, a coin toss.")

    with gr.Row():
        model_name = gr.Radio(
            MODEL_NAMES + [OWN_MODEL_NAME],
            type="value",
            label="Pick a preloaded BERT-like model for uncertainty evaluation (note: BERT-base performance least consistent)...",
        )
        own_model_name = gr.Textbox(
            label=(
                f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name "
                "(that supports the [fill-mask task](https://huggingface.co/models?pipeline_tag=fill-mask)) here."
            ),
        )

    with gr.Row():
        occ_box = gr.Radio(
            occs+[PICK_YOUR_OWN_LABEL],
            label=(
                f"Pick an Occupation type from the Winogender Schemas evaluation set, or select '{PICK_YOUR_OWN_LABEL}' "
                "(it need not be about an occupation)."
            ),
        )

    with gr.Row():
        alt_input_texts = gr.Textbox(
            lines=2,
            label=(
                f"...If you selected '{PICK_YOUR_OWN_LABEL}' above, add your own texts new-line delimited sentences here. Be sure "
                "to include a single MASK-ed out pronoun. "
                "If unsure on the required format, click an occupation above instead, to see some example input texts for this round."
            ),
        )

    with gr.Row():
        get_text_btn = gr.Button("Load input texts")
        # The preview textbox is created inline; the plain sentence list is
        # stored in `input_texts` for the evaluation step.
        get_text_btn.click(
            fn=display_input_texts,
            inputs=[occ_box, alt_input_texts],
            outputs=[gr.Textbox(
                label='Numbered sentences for evaluation. Number below corresponds to number in x-axis of plot.'), input_texts],
        )

    with gr.Row():
        uncertain_btn = gr.Button("Get uncertainty results!")
        gr.Markdown(
            "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")

    with gr.Row():
        female_fig = gr.Plot(type="auto")

    with gr.Row():
        female_df = gr.Dataframe()

    uncertain_btn.click(
        fn=predict_gender_pronouns,
        inputs=[model_name, own_model_name, input_texts, occ_box],
        outputs=[female_df, female_fig]
    )

demo.launch(debug=True)
# %% | |