|
import os

import torch
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import plotly.graph_objects as go
import plotly.express as px

def z_score(y, mean=.04853076, sd=.9409466):
    """Standardize a raw model score; the defaults are presumably the mean and
    SD of the human desirability ratings, so 0 corresponds to "neutral"."""
    return (y - mean) / sd
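
# Worked example with the defaults above: z_score(0.5) = (0.5 - 0.0485) / 0.9409 ≈ 0.48.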
|
|
|
def indicator_plot(value, title, value_range, domain): |
|
|
|
plot = go.Indicator( |
|
mode = "gauge+delta", |
|
value = value, |
|
domain = domain, |
|
title = title, |
|
delta = { |
|
'reference': 0, |
|
'decreasing': {'color': "#ec4899"}, |
|
'increasing': {'color': "#36def1"} |
|
}, |
|
gauge = { |
|
'axis': {'range': value_range, 'tickwidth': 1, 'tickcolor': "black"}, |
|
'bar': {'color': "#4361ee"}, |
|
'bgcolor': "white", |
|
'borderwidth': 2, |
|
'bordercolor': "#efefef", |
|
'steps': [ |
|
{'range': [value_range[0], 0], 'color': '#efefef'}, |
|
{'range': [0, value_range[1]], 'color': '#efefef'} |
|
], |
|
'threshold': { |
|
'line': {'color': "#4361ee", 'width': 8}, |
|
'thickness': 0.75, |
|
'value': value |
|
} |
|
} |
|
) |
|
|
|
return plot |
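
# Example usage (mirroring the calls further below):
#   fig.add_trace(indicator_plot(0.5, 'Item Sentiment', [-1, 1], {'x': [.55, 1], 'y': [0, 1]}))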
|
|
|
def scatter_plot(df, group_var):
    """Faceted scatter of human vs. machine ratings with per-trace OLS trendlines."""
    colors = ['#36def1', '#4361ee'] if group_var else ['#4361ee']

    plot = px.scatter(
        df,
        x='Machine-ratings',
        y='Human-ratings',
        color=group_var,
        facet_col='x_group',
        facet_col_wrap=2,
        trendline='ols',
        trendline_scope='trace',
        hover_data={
            'Text': df.text,
            'Language': False,
            'x_group': False,
            'Human-ratings': ':.2f',
            'Machine-ratings': ':.2f',
            'Study': df.study,
            'Instrument': df.instrument,
        },
        width=400,
        height=400,
        color_discrete_sequence=colors
    )

    # Keep only the value part of facet titles (e.g. "x_group=Sentiment Model" -> "Sentiment Model").
    plot.for_each_annotation(lambda a: a.update(text=a.text.split('=')[-1]))
    plot.update_layout(
        legend={
            'orientation': 'h',
            'yanchor': 'bottom',
            'y': -.30
        })
    plot.update_xaxes(title_standoff=0)

    return plot
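
# Note: px.scatter's trendline='ols' requires the statsmodels package at runtime.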
|
|
|
|
|
# Raw column names mapped to the display labels used in the "Group by" selector.
covariate_columns = {
    'content_domain': 'Content Domain',
    'language': 'Language',
    'rater_group': 'Rater Group',
}
|
|
|
df = (
    pd
    .read_feather(path='data.feather')
    # The original '... == "test" | ... == "dev"' breaks on operator precedence
    # inside query strings; use `in` instead.
    .query('partition in ["test", "dev"]')
    .melt(
        value_vars=['sentiment_model', 'desirability_model'],
        var_name='x_group',
        value_name='x',
        id_vars=['mean_z', 'text', 'content_domain', 'language', 'rater_group', 'study', 'instrument']
    )
    .replace(
        to_replace={
            'en': 'English',
            'de': 'German',
            'other': 'Other',
            'personality': 'Personality',
            'laypeople': 'Laypeople',
            'students': 'Students',
            'sentiment_model': 'Sentiment Model',
            'desirability_model': 'Desirability Model'
        }
    )
    .rename(columns=covariate_columns)
    .rename(
        columns={
            'mean_z': 'Human-ratings',
            'x': 'Machine-ratings',
        }
    )
)
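
# Resulting long-format frame: one row per item x model, with 'Human-ratings'
# (mean_z), 'Machine-ratings' (x), 'x_group' naming the model, plus 'text',
# 'study', 'instrument', and the renamed covariate columns.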
|
|
|
st.markdown(""" |
|
# NLP for Item Desirability Ratings |
|
This web application accompanies the paper "*Expanding the Methodological Toolbox: Machine-Based Item Desirability Ratings as an Alternative to Human-Based Ratings*". |
|
|
|
## What is this research about? |
|
Researchers use personality scales to measure people's traits and behaviors, but biases can affect the accuracy of these scales. |
|
Socially desirable responding is a common bias that can skew results. To overcome this, researchers gather item desirability ratings, e.g., to ensure that questions are neutral. |
|
Recently, advancements in natural language processing have made it possible to use machines to estimate social desirability ratings, |
|
which can provide a viable alternative to human ratings and help researchers, scale developers, and practitioners improve the accuracy of personality scales. |
|
""") |
|
|
|
|
|
st.markdown(""" |
|
## Try it yourself! |
|
Use the text field below to enter a statement that might be part of a psychological questionnaire (e.g., "I love a good fight."). |
|
The left dial indicates how socially desirable it might be to endorse this item. |
|
The right dial indicates sentiment (i.e., valence) as estimated by regular sentiment analysis (using the `cardiffnlp/twitter-xlm-roberta-base-sentiment` model). |
|
""") |
|
|
|
|
|
with st.spinner('Processing...'):

    # Hosted deployment: the env var holds a hub auth token and we load the
    # published model; otherwise fall back to a local fine-tuned checkpoint.
    if os.environ.get('item-desirability'):
        model_path = 'magnolia-psychometrics/item-desirability'
    else:
        model_path = '/nlp/nlp/models/finetuned/twitter-xlm-roberta-base-regressive-desirability-ft-4'

    auth_token = os.environ.get('item-desirability') or True

    if 'tokenizer' not in globals():
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=model_path,
            use_fast=True,
            use_auth_token=auth_token
        )

    if 'model' not in globals():
        model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_path,
            num_labels=1,  # single output: regression head for desirability scores
            ignore_mismatched_sizes=True,
            use_auth_token=auth_token
        )

    if 'classifier' not in globals():
        sentiment_model = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'
        classifier = pipeline(
            'sentiment-analysis',
            model=sentiment_model,
            tokenizer=sentiment_model,
            use_fast=False,
            top_k=3  # return scores for all three sentiment labels
        )
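
# For a single input string with top_k=3, classifier(...) returns a nested list:
# [[{'label': 'positive', 'score': ...}, {'label': 'neutral', ...}, {'label': 'negative', ...}]]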
|
|
|
input_text = st.text_input(
    label='Estimate item desirability:',
    value='I love a good fight.',
    placeholder='Enter item text'
)
|
|
|
if input_text:

    # Sentiment score: difference between the positive and negative class
    # probabilities, yielding a value in [-1, 1].
    classifier_output = classifier(input_text)
    classifier_output_dict = {x['label']: x['score'] for x in classifier_output[0]}
    classifier_score = classifier_output_dict['positive'] - classifier_output_dict['negative']

    # Desirability score: the regression head's single logit, standardized.
    inputs = tokenizer(input_text, padding=True, return_tensors='pt')

    with torch.no_grad():
        score = model(**inputs).logits.squeeze().tolist()
        z = z_score(score)

    p1 = indicator_plot(
        value=classifier_score,
        title='Item Sentiment',
        value_range=[-1, 1],
        domain={'x': [.55, 1], 'y': [0, 1]}
    )

    p2 = indicator_plot(
        value=z,
        title='Item Desirability',
        value_range=[-4, 4],
        domain={'x': [0, .45], 'y': [0, 1]},
    )

    fig = go.Figure()
    fig.add_trace(p1)
    fig.add_trace(p2)

    fig.update_layout(
        title=dict(text=f'"{input_text}"', font=dict(size=36), yref='paper'),
        paper_bgcolor="white",
        font={'color': "black", 'family': "Arial"})

    st.plotly_chart(fig, theme=None, use_container_width=True)
|
|
|
st.markdown(""" |
|
Item sentiment: Absolute differences between positive and negative sentiment. |
|
Item desirability: z-transformed values, 0 indicated "neutral". |
|
""") |
|
|
|
|
|
st.markdown(""" |
|
## Explore the data |
|
Figures show the accuarcy in precitions of human-rated item desirability by the sentiment model (left) and the desirability model (right), using `test`-partition data only. |
|
""") |
|
|
|
|
|
show_covariates = st.checkbox('Show covariates', value=True)

if show_covariates:
    option = st.selectbox('Group by', options=list(covariate_columns.values()))
else:
    option = None

plot = scatter_plot(df, option)

st.plotly_chart(plot, theme=None, use_container_width=True)