def main():
    """
    Creates a Streamlit web app that classifies a given body of text as either
    human-made or AI-generated, using a pre-trained model.
    """
    import streamlit as st
    import numpy as np
    import joblib
    import string
    import re
    import scipy
    import spacy
    import torch
    from transformers import AutoTokenizer
    from eli5.lime import TextExplainer
    from eli5.lime.samplers import MaskingTextSampler
    import eli5
    import shap
    from custom_models import HF_DistilBertBasedModelAppDocs, HF_BertBasedModelAppDocs

    # Initialize spaCy
    nlp = spacy.load("en_core_web_sm")

    # Device on which to run the DL models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    def format_text(text: str) -> str:
        """
        Cleans the input string: drops tokens that spaCy tags as PERSON or DATE
        entities, strips alphanumeric tokens such as course codes, replaces a
        set of known boilerplate substrings with empty strings, lowercases the
        text, removes punctuation, and trims surrounding whitespace.
        """
        doc = nlp(text)
        text = " ".join([token.text for token in doc if token.ent_type_ not in ["PERSON", "DATE"]])
        # Remove alphanumeric tokens such as course codes (e.g. "cs501")
        pattern = r"\b[A-Za-z]+\d+\b"
        text = re.sub(pattern, "", text)
        # The text is lowercase after .lower(), so the placeholder patterns
        # below are matched in lowercase
        return (text.replace("REDACTED", "").lower()
                .replace("[name]", "").replace("[your name]", "")
                .replace("dear admissions committee,", "").replace("sincerely,", "")
                .replace("[university's name]", "fordham").replace("dear sir/madam,", "")
                .replace("— statement of intent ", "")
                .replace("program: master of science in data analytics name of applicant: ", "")
                .replace("data analytics", "data science").replace("| \u200b", "")
                .replace("m.s. in data science at lincoln center ", "")
                .translate(str.maketrans("", "", string.punctuation)).strip())
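    # Example (hypothetical input) of what format_text does; the exact output
    # depends on spaCy's entity predictions:
    #   format_text("Dear Admissions Committee, I met Dr. Smith in 2021 for CS501.")
    # would drop "Smith" (PERSON) and "2021" (DATE), strip the course code
    # "CS501" via the regex, remove punctuation, and lowercase the rest.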
    # Functions to classify text
    def nb_lr(model, text: str) -> tuple[int, float]:
        """
        Runs a previously trained sklearn Pipeline (Naive Bayes or Logistic
        Regression) on the input text and returns the predicted class and its
        probability as a tuple.
        """
        # Clean and format the input text
        text = format_text(text)
        # Predict using either LR or NB and get the prediction probability
        prediction = model.predict([text]).item()
        predict_proba = round(model.predict_proba([text]).squeeze()[prediction].item(), 4)
        return prediction, predict_proba
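    # Example (hypothetical): with a fitted sklearn Pipeline `pipe`,
    # nb_lr(pipe, "some application essay ...") might return (0, 0.9731),
    # i.e. class 0 ("human-made") with a 97.31% predicted probability.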
    def torch_pred(tokenizer, model, text: str) -> tuple[int, float]:
        """
        Runs a previously trained transformer-based model (DistilBERT or BERT)
        on the tokenized input text and returns the predicted class and its
        probability as a tuple.
        """
        # DL models (BERT/DistilBERT-based)
        cleaned_text_tokens = tokenizer([text], padding="max_length", max_length=512, truncation=True)
        with torch.inference_mode():
            input_ids, att = cleaned_text_tokens["input_ids"], cleaned_text_tokens["attention_mask"]
            input_ids = torch.tensor(input_ids).to(device)
            attention_mask = torch.tensor(att).to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            _, prediction = torch.max(logits, 1)
            prediction = prediction.item()
            predict_proba = round(torch.softmax(logits, 1).cpu().squeeze().tolist()[prediction], 4)
        return prediction, predict_proba
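    # For a single input, and assuming the custom model returns raw logits as
    # the code above implies, `logits` has shape [1, 2] (human vs. AI):
    # torch.max picks the argmax class and torch.softmax converts the logits
    # into its probability, e.g. an illustrative return value of (1, 0.8812)
    # means "AI-generated" at 88.12%.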
    def pred_str(prediction: int) -> str:
        """
        Maps an integer class prediction from the classifier
        (0 = human-made, 1 = AI-generated) to a display string.
        """
        if prediction == 0:
            return "Human-made 🤷‍♂️🤷‍♀️"
        else:
            return "Generated with AI 🦾"
    @st.cache_resource  # assumed decorator for the "cache memory" behavior the docstring describes
    def load_tokenizer(option):
        """
        Load the pre-trained tokenizer and save it in cache memory.
        """
        if option == "BERT-based model":
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", padding="max_length", max_length=512, truncation=True)
        else:
            tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", padding="max_length", max_length=512, truncation=True)
        return tokenizer
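    # Note: the padding/truncation that actually takes effect is applied when
    # the tokenizer is called (see torch_pred above), not via the
    # from_pretrained kwargs.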
    @st.cache_resource  # assumed decorator for the "cache memory" behavior the docstring describes
    def load_model(option):
        """
        Load a trained transformer-based model and save it in cache memory.
        """
        if option == "BERT-based model":
            model = HF_BertBasedModelAppDocs.from_pretrained("ferdmartin/HF_BertBasedModelAppDocs2").to(device)
        else:
            model = HF_DistilBertBasedModelAppDocs.from_pretrained("ferdmartin/HF_DistilBertBasedModelAppDocs2").to(device)
        return model
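    # With st.cache_resource (assumed above), each tokenizer and model is
    # downloaded and initialized once per server process; switching options in
    # the UI afterwards reuses the cached instances.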
    # Streamlit app:
    # Models available (for the transformer options, load_model() resolves the Hub repo ids itself)
    models_available = {
        "Logistic Regression": "models/baseline_model_lr2.joblib",
        "Naive Bayes": "models/baseline_model_nb2.joblib",
        "DistilBERT-based model (BERT light)": "ferdmartin/HF_DistilBertBasedModelAppDocs",
        "BERT-based model": "ferdmartin/HF_BertBasedModelAppDocs",
    }

    st.set_page_config(page_title="AI/Human GradAppDocs", page_icon="🤖", layout="wide")
    st.title("Academic Application Document Classifier")
    st.header("Is it human-made or Generated with AI 🤖?")
    # Pick the model to use
    def restore_prediction_state():
        """Clear the stored prediction when the selected model changes."""
        if "prediction" in st.session_state:
            del st.session_state.prediction

    option = st.selectbox("Select a model to use:", models_available, on_change=restore_prediction_state)

    # Load the selected trained model
    if option in ("BERT-based model", "DistilBERT-based model (BERT light)"):
        tokenizer = load_tokenizer(option)
        model = load_model(option)
    else:
        model = joblib.load(models_available[option])

    text = st.text_area("Enter either a statement of intent or a letter of recommendation:")
    # Hide Streamlit's default "made with streamlit" footer and header
    hide_st_style = """
        <style>
        footer {visibility: hidden;}
        header {visibility: hidden;}
        </style>
    """
    st.markdown(hide_st_style, unsafe_allow_html=True)
    # Use the selected model
    if st.button("Let's check this text!"):
        if text.strip() == "":
            # In case there is no input for the model
            st.error("Please enter some text")
        else:
            with st.spinner("Wait for the magic 🪄🔮"):
                if option in ("Naive Bayes", "Logistic Regression"):  # sklearn pipeline models
                    prediction, predict_proba = nb_lr(model, text)
                    st.session_state["sklearn"] = True
                else:  # Transformer-based models
                    prediction, predict_proba = torch_pred(tokenizer, model, text)
                    st.session_state["torch"] = True
            # Store the result in session state
            st.session_state["color_pred"] = "blue" if prediction == 0 else "red"  # Color for the prediction string
            prediction = pred_str(prediction)  # Map the prediction (int => str)
            st.session_state["prediction"] = prediction
            st.session_state["predict_proba"] = predict_proba
            st.session_state["text"] = text
            # Print the result
            st.markdown(f"I think this text is: **:{st.session_state['color_pred']}[{st.session_state['prediction']}]** (Prediction probability: {st.session_state['predict_proba'] * 100}%)")
    elif "prediction" in st.session_state:
        # Display the stored result if available
        st.markdown(f"I think this text is: **:{st.session_state['color_pred']}[{st.session_state['prediction']}]** (Prediction probability: {st.session_state['predict_proba'] * 100}%)")
if st.button("Model Explanation"): | |
# Check if there's text in the session state | |
if "text" in st.session_state and "prediction" in st.session_state: | |
if option in ("Naive Bayes", "Logistic Regression"): | |
with st.spinner('Wait for it ๐ญ...'): | |
explainer = TextExplainer(sampler=MaskingTextSampler()) | |
explainer.fit(st.session_state["text"], model.predict_proba) | |
html = eli5.format_as_html(explainer.explain_prediction(target_names=["Human", "AI"])) | |
else: | |
with st.spinner('Wait for it ๐ญ... BERT-based model explanations take around 4-10 minutes. In case you want to abort, refresh the page.'): | |
def f(x): | |
"""TORCH EXPLAINER PRED FUNC (USES logits)""" | |
tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=512, truncation=True) for v in x])#.cuda() | |
outputs = model(tv).detach().cpu().numpy() | |
scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T | |
val = scipy.special.logit(scores[:,1]) # use one vs rest logit units | |
return val | |
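                    # scipy.special.logit maps the class-1 probability p to
                    # log(p / (1 - p)), so the SHAP attributions computed below
                    # are additive in log-odds units for class 1 ("AI").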
                    # Build the explainer using masking tokens and the selected transformer-based model
                    explainer = shap.Explainer(f, tokenizer)
                    shap_values = explainer([st.session_state["text"]], fixed_context=1)
                    html = shap.plots.text(shap_values, display=False)
            # Render the HTML explanation
            st.components.v1.html(html, height=500, scrolling=True)
        else:
            st.error("Please enter some text and click \"Let's check this text!\" before requesting an explanation.")
if __name__ == "__main__":
    main()