Spaces: Runtime error
ferdmartin committed
Commit 4fc2f5a • 1 Parent(s): e59445e
Update app.py

app.py CHANGED
@@ -49,7 +49,12 @@ def main():
         translate(str.maketrans('', '', string.punctuation)).strip().lstrip()
 
     # Define the function to classify text
-    def nb_lr(model, text):
+    def nb_lr(model, text: str) -> (int, float):
+        """
+        This function takes a previously trained Sklearn Pipeline
+        model (NaiveBayes or Logistic Regression), then returns the prediction
+        probability and the final prediction as a tuple.
+        """
         # Clean and format the input text
         text = format_text(text)
         # Predict using either LR or NB and get prediction probability
@@ -58,6 +63,11 @@ def main():
         return prediction, predict_proba
 
     def torch_pred(tokenizer, model, text):
+        """
+        This function takes a pre-trained tokenizer and a previously trained
+        transformer-based model (DistilBert or Bert), then returns the prediction
+        probability and the final prediction as a tuple.
+        """
         # DL models (BERT/DistilBERT based models)
         cleaned_text_tokens = tokenizer([text], padding='max_length', max_length=512, truncation=True)
         with torch.inference_mode():
@@ -70,7 +80,11 @@ def main():
         predict_proba = round(torch.softmax(logits, 1).cpu().squeeze().tolist()[prediction],4)
         return prediction, predict_proba
 
-    def pred_str(prediction):
+    def pred_str(prediction: int) -> str:
+        """
+        This function takes an integer value as input and returns a string representing the type of the input's source.
+        The input is expected to be a prediction from a classification model that distinguishes between human-made and AI-generated text.
+        """
         # Map the predicted class to string output
         if prediction == 0:
             return "Human-made 🤷♂️🤷♀️"
@@ -79,6 +93,9 @@ def main():
 
     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
     def load_tokenizer(option):
+        """
+        Load the pre-trained tokenizer and save it in cache memory.
+        """
         if option == "BERT-based model":
             tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", padding='max_length', max_length=512, truncation=True)
         else:
@@ -87,6 +104,9 @@ def main():
 
     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
     def load_model(option):
+        """
+        Load the trained Transformer-based models and save them in cache memory.
+        """
         if option == "BERT-based model":
             model = HF_BertBasedModelAppDocs.from_pretrained("ferdmartin/HF_BertBasedModelAppDocs").to(device)
         else:
@@ -95,7 +115,7 @@ def main():
 
 
     # Streamlit app:
-
+    # List of models available
     models_available = {"Logistic Regression": "models/baseline_model_lr2.joblib",
                         "Naive Bayes": "models/baseline_model_nb2.joblib",
                         "DistilBERT-based model (BERT light)": "ferdmartin/HF_DistilBertBasedModelAppDocs",
@@ -108,11 +128,12 @@ def main():
 
     # Check the model to use
    def restore_prediction_state():
+        """Restore the session_state variable to clear the prediction after changing the model"""
         if "prediction" in st.session_state:
             del st.session_state.prediction
+
     option = st.selectbox("Select a model to use:", models_available, on_change=restore_prediction_state)
 
-
     # Load the selected trained model
     if option in ("BERT-based model", "DistilBERT-based model (BERT light)"):
         tokenizer = load_tokenizer(option)
@@ -135,20 +156,21 @@ def main():
     # Use model
     if st.button("Let's check this text!"):
         if text.strip() == "":
+            # In case there is no input for the model
             st.error("Please enter some text")
         else:
             with st.spinner("Wait for the magic 🪄🔮"):
-                # Use
-                if option in ("Naive Bayes", "Logistic Regression"):
+                # Use models
+                if option in ("Naive Bayes", "Logistic Regression"):  # Use Sklearn pipeline models
                     prediction, predict_proba = nb_lr(model, text)
                     st.session_state["sklearn"] = True
                 else:
-                    prediction, predict_proba = torch_pred(tokenizer, model, text)
+                    prediction, predict_proba = torch_pred(tokenizer, model, text)  # Use transformers
                     st.session_state["torch"] = True
 
                 # Store the result in session state
-                st.session_state["color_pred"] = "blue" if prediction == 0 else "red"
-                prediction = pred_str(prediction)
+                st.session_state["color_pred"] = "blue" if prediction == 0 else "red"  # Set color for the prediction output string
+                prediction = pred_str(prediction)  # Map predictions (int => str)
                 st.session_state["prediction"] = prediction
                 st.session_state["predict_proba"] = predict_proba
                 st.session_state["text"] = text
@@ -171,15 +193,14 @@ def main():
                     html = eli5.format_as_html(explainer.explain_prediction(target_names=["Human", "AI"]))
             else:
                 with st.spinner('Wait for it 💭... BERT-based model explanations take around 4-10 minutes. In case you want to abort, refresh the page.'):
-                    # TORCH EXPLAINER PRED FUNC (USES logits)
                     def f(x):
+                        """TORCH EXPLAINER PRED FUNC (USES logits)"""
                         tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=512, truncation=True) for v in x])  #.cuda()
                         outputs = model(tv).detach().cpu().numpy()
                         scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
                         val = scipy.special.logit(scores[:,1])  # use one-vs-rest logit units
                         return val
-                    # build
-                    explainer = shap.Explainer(f, tokenizer)
+                    explainer = shap.Explainer(f, tokenizer)  # build the explainer using masking tokens and the selected transformer-based model
                     shap_values = explainer([st.session_state["text"]], fixed_context=1)
                     html = shap.plots.text(shap_values, display=False)
                     # Render HTML
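
For orientation, here is a minimal usage sketch of the helper functions this commit documents, assuming model and tokenizer were already loaded via load_model/load_tokenizer; option and sample_text are hypothetical stand-ins, not values taken from the app:

    option = "Naive Bayes"  # hypothetical selection; any key of models_available works
    sample_text = "I am writing to express my interest in the graduate program..."
    if option in ("Naive Bayes", "Logistic Regression"):
        # Sklearn pipeline path: format_text() cleans the input, then the pipeline predicts
        prediction, predict_proba = nb_lr(model, sample_text)
    else:
        # Transformer path: tokenize to max_length=512 and run the model in inference mode
        prediction, predict_proba = torch_pred(tokenizer, model, sample_text)
    print(pred_str(prediction), predict_proba)  # human-readable label plus rounded probability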