ferdmartin committed
Commit 71edabc (0 parents)

Duplicate from ferdmartin/GradApplicationDocsApp

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,9 @@
+ [theme]
+ primaryColor="#6eb52f"
+ backgroundColor="#f0f0f5"
+ secondaryBackgroundColor="#e0e0ef"
+ textColor="#262730"
+ font="monospace"
+
+ [client]
+ showErrorDetails = false
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: GradApplicationDocsApp
+ emoji: 💻
+ colorFrom: purple
+ colorTo: purple
+ sdk: streamlit
+ sdk_version: 1.19.0
+ app_file: app.py
+ pinned: false
+ fullWidth: true
+ license: mit
+ duplicated_from: ferdmartin/GradApplicationDocsApp
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,214 @@
+ def main():
+     """
+     Creates a Streamlit web app that classifies a given body of text as either
+     human-made or AI-generated, using a pre-trained model.
+     """
+     import streamlit as st
+     import numpy as np
+     import joblib
+     import string
+     import time  # used by the (commented-out) progress-bar demo below
+     import scipy.special  # explicit submodule import so scipy.special.logit is always available
+     import spacy
+     import re
+     from transformers import AutoTokenizer
+     import torch
+     from eli5.lime import TextExplainer
+     from eli5.lime.samplers import MaskingTextSampler
+     import eli5
+     import shap
+     from custom_models import HF_DistilBertBasedModelAppDocs, HF_BertBasedModelAppDocs
+
+     # Initialize spaCy (used for NER-based cleanup in format_text)
+     nlp = spacy.load("en_core_web_sm")
+
+     # Device on which to run the deep-learning models
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     def format_text(text: str) -> str:
+         """
+         Formats the input string for classification: drops PERSON and DATE
+         entities, removes letter+digit tokens, strips known template
+         substrings, converts the text to lowercase, and removes punctuation
+         and leading/trailing whitespace.
+         """
+         text = nlp(text)
+         text = " ".join([token.text for token in text if token.ent_type_ not in ["PERSON", "DATE"]])
+
+         pattern = r"\b[A-Za-z]+\d+\b"  # tokens such as "abc123" (e.g., anonymized IDs)
+         text = re.sub(pattern, "", text)
+
+         return text.replace("REDACTED", "").lower().replace("[name]", "").replace("[your name]", "").\
+             replace("dear admissions committee,", "").replace("sincerely,", "").\
+             replace("[university's name]", "fordham").replace("dear sir/madam,", "").\
+             replace("– statement of intent ", "").\
+             replace('program: master of science in data analytics name of applicant: ', "").\
+             replace("data analytics", "data science").replace("| \u200b", "").\
+             replace("m.s. in data science at lincoln center ", "").\
+             translate(str.maketrans('', '', string.punctuation)).strip()
+
+     # Classify text with one of the scikit-learn baselines
+     def nb_lr(model, text):
+         # Clean and format the input text
+         text = format_text(text)
+         # Predict using either LR or NB and get the prediction probability
+         prediction = model.predict([text]).item()
+         predict_proba = round(model.predict_proba([text]).squeeze()[prediction].item(), 4)
+         return prediction, predict_proba
+
+     def torch_pred(tokenizer, model, text):
+         # DL models (BERT/DistilBERT-based models)
+         cleaned_text_tokens = tokenizer([text], padding='max_length', max_length=512, truncation=True)
+         with torch.inference_mode():
+             input_ids, att = cleaned_text_tokens["input_ids"], cleaned_text_tokens["attention_mask"]
+             input_ids = torch.tensor(input_ids).to(device)
+             attention_mask = torch.tensor(att).to(device)
+             logits = model(input_ids=input_ids, attention_mask=attention_mask)
+             _, prediction = torch.max(logits, 1)
+             prediction = prediction.item()
+             predict_proba = round(torch.softmax(logits, 1).cpu().numpy().squeeze()[prediction].item(), 4)  # .cpu() so the NumPy conversion also works on CUDA
+         return prediction, predict_proba
+
+     def pred_str(prediction):
+         # Map the predicted class to a display string
+         if prediction == 0:
+             return "Human-made 🤷‍♂️🤷‍♀️"
+         else:
+             return "Generated with AI 🦾"
+
+     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
+     def load_tokenizer(option):
+         if option == "BERT-based model":
+             tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", padding='max_length', max_length=512, truncation=True)
+         else:
+             tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", padding='max_length', max_length=512, truncation=True)
+         return tokenizer
+
+     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
+     def load_model(option):
+         if option == "BERT-based model":
+             model = HF_BertBasedModelAppDocs.from_pretrained("ferdmartin/HF_BertBasedModelAppDocs").to(device)
+         else:
+             model = HF_DistilBertBasedModelAppDocs.from_pretrained("ferdmartin/HF_DistilBertBasedModelAppDocs").to(device)
+         return model
+
+
+     # Streamlit app:
+
+     models_available = {"Logistic Regression": "models/baseline_model_lr2.joblib",
+                         "Naive Bayes": "models/baseline_model_nb2.joblib",
+                         "DistilBERT-based model (BERT light)": "ferdmartin/HF_DistilBertBasedModelAppDocs",
+                         "BERT-based model": "ferdmartin/HF_BertBasedModelAppDocs"
+                         }
+
+     st.set_page_config(layout="wide")
+     st.title("Academic Application Document Classifier")
+     st.header("Is it human-made 📝 or generated with AI 🤖?")
+
+     # Choose the model to use
+     option = st.selectbox("Select a model to use:", models_available)
+
+     # Load the selected trained model
+     if option in ("BERT-based model", "DistilBERT-based model (BERT light)"):
+         tokenizer = load_tokenizer(option)
+         model = load_model(option)
+     else:
+         model = joblib.load(models_available[option])
+
+
+     text = st.text_area("Enter either a statement of intent or a letter of recommendation:")
+
+     # Hide Streamlit's "made with streamlit" footer and default header
+     hide_st_style = """
+         <style>
+         footer {visibility: hidden;}
+         header {visibility: hidden;}
+         </style>
+         """
+     st.markdown(hide_st_style, unsafe_allow_html=True)
+
+     # Run the selected model
+     if st.button("Let's check this text!"):
+         if text.strip() == "":
+             st.error("Please enter some text")
+         else:
+             # # Add a progress bar
+             # progress_bar = st.progress(0)
+
+             # # Add a placeholder for the progress message
+             # status_text = st.empty()
+
+             # # Simulate a long-running process
+             # for i in range(100):
+             #     # Update the progress bar every 0.05 seconds
+             #     time.sleep(0.05)
+             #     progress_bar.progress(i + 1)
+
+             #     if i % 2 == 0:
+             #         magic = "✨"
+             #     else:
+             #         magic = ""
+             #     # Update the progress message
+             #     status_text.write(f"Work in progress {i + 1}%... Wait for the magic 🪄🔮{magic}")
+             # # Clear the progress bar and status message
+             # progress_bar.empty()
+             # status_text.empty()
+             with st.spinner("Wait for the magic 🪄🔮"):
+                 # Run inference with the chosen model family
+                 if option in ("Naive Bayes", "Logistic Regression"):
+                     prediction, predict_proba = nb_lr(model, text)
+                     st.session_state["sklearn"] = True
+                 else:
+                     prediction, predict_proba = torch_pred(tokenizer, model, text)
+                     st.session_state["torch"] = True
+
+             # Store the result in session state
+             st.session_state["color_pred"] = "blue" if prediction == 0 else "red"
+             prediction = pred_str(prediction)
+             st.session_state["prediction"] = prediction
+             st.session_state["predict_proba"] = predict_proba
+             st.session_state["text"] = text
+
+             # Print the result
+             # st.write(f"<span style='font-size: 24px;'>I think this text is: {prediction}</span>",
+             #          unsafe_allow_html=True)
+             st.markdown(f"I think this text is: **:{st.session_state['color_pred']}[{st.session_state['prediction']}]** (Prediction probability: {st.session_state['predict_proba'] * 100}%)")
+             # help=f"I estimate that its probability is {st.session_state['predict_proba'] * 100}%")
+
+     elif "prediction" in st.session_state:
+         # Display the stored result if available
+         # st.write(f"<span style='font-size: 24px;'>I think this text is: {st.session_state['prediction']}</span>", unsafe_allow_html=True)
+
+         st.markdown(f"I think this text is: **:{st.session_state['color_pred']}[{st.session_state['prediction']}]** (Prediction probability: {st.session_state['predict_proba'] * 100}%)")
+         # help=f"I estimate that its probability is {st.session_state['predict_proba'] * 100}%")
+         # **:blue[colored]** (Streamlit colored-text syntax used above)
+     if st.button("Model Explanation"):
+         # Check if there's text in the session state
+         if "text" in st.session_state:
+
+             if option in ("Naive Bayes", "Logistic Regression"):
+                 with st.spinner('Wait for it 💭...'):
+                     explainer = TextExplainer(sampler=MaskingTextSampler())
+                     explainer.fit(st.session_state["text"], model.predict_proba)
+                     html = eli5.format_as_html(explainer.explain_prediction(target_names=["Human", "AI"]))
+             else:
+                 with st.spinner('Wait for it 💭... BERT-based model explanations take around 4-10 minutes. If you want to abort, refresh the page.'):
+                     # Torch explainer prediction function (uses logits)
+                     def f(x):
+                         tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=512, truncation=True) for v in x]).to(device)
+                         outputs = model(tv).detach().cpu().numpy()
+                         scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T  # softmax over the two classes
+                         val = scipy.special.logit(scores[:, 1])  # one-vs-rest logit units
+                         return val
+                     # Build an explainer that uses the tokenizer as a token masker
+                     explainer = shap.Explainer(f, tokenizer)
+                     shap_values = explainer([st.session_state["text"]], fixed_context=1)
+                     html = shap.plots.text(shap_values, display=False)
+             # Render the HTML explanation (produced by either branch)
+             st.components.v1.html(html, height=500, scrolling=True)
+         else:
+             st.error("Please enter some text and click 'Let's check this text!' before requesting an explanation.")
+
+ if __name__ == "__main__":
+     main()
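
For quick testing outside Streamlit, here is a minimal sketch of the scikit-learn prediction path that nb_lr implements. The sample string is a placeholder, the real app runs format_text on the input first, and the joblib file must be pulled from LFS:

import joblib

# Load the Logistic Regression baseline added in this commit
model = joblib.load("models/baseline_model_lr2.joblib")

sample_text = "placeholder statement of intent text"  # hypothetical input
prediction = model.predict([sample_text]).item()  # 0 = human-made, 1 = AI-generated
predict_proba = model.predict_proba([sample_text]).squeeze()[prediction].item()
print(prediction, round(predict_proba, 4))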
custom_models.py ADDED
@@ -0,0 +1,87 @@
+ from typing import Optional
+ from transformers import PreTrainedModel, PretrainedConfig, DistilBertModel, BertModel
+ import torch
+ from torch import nn
+
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ class TransformerBasedModelDistilBert(nn.Module):
+     def __init__(self):
+         super(TransformerBasedModelDistilBert, self).__init__()
+         self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
+         self.dropout = nn.Dropout(0.55)
+         self.fc = nn.Linear(768, 2)
+
+     def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
+         input_shape = input_ids.size()
+         if attention_mask is None:
+             attention_mask = torch.ones(input_shape, device=device)
+
+         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+         pooled_output = outputs.last_hidden_state[:, 0, :]  # [CLS] hidden state (DistilBERT has no pooler)
+         pooled_output = self.dropout(pooled_output)
+         logits = self.fc(pooled_output)
+         return logits
+
+ class TransformerBasedModelBert(nn.Module):
+     def __init__(self):
+         super(TransformerBasedModelBert, self).__init__()
+         self.bert = BertModel.from_pretrained('bert-base-uncased')
+         self.dropout = nn.Dropout(0.55)
+         self.fc = nn.Linear(768, 2)
+
+     def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
+         input_shape = input_ids.size()
+         if attention_mask is None:
+             attention_mask = torch.ones(input_shape, device=device)
+
+         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+         pooled_output = outputs[1]  # BERT's tanh-pooled [CLS] output
+         pooled_output = self.dropout(pooled_output)
+         logits = self.fc(pooled_output)
+         return logits
+
+ class MyConfigDistil(PretrainedConfig):
+     model_type = "distilbert"
+     def __init__(self, final_dropout=0.55, **kwargs):
+         super().__init__(**kwargs)
+         self.final_dropout = final_dropout
+
+ class MyConfig(PretrainedConfig):
+     model_type = "bert"
+     def __init__(self, final_dropout=0.55, **kwargs):
+         super().__init__(**kwargs)
+         self.final_dropout = final_dropout
+
+ class MyHFModel_DistilBertBased(PreTrainedModel):
+     config_class = MyConfigDistil
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.model = TransformerBasedModelDistilBert()
+     def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
+         input_shape = input_ids.size()
+         if attention_mask is None:
+             attention_mask = torch.ones(input_shape, device=device)
+
+         return self.model(input_ids=input_ids, attention_mask=attention_mask)
+
+ class MyHFModel_BertBased(PreTrainedModel):
+     config_class = MyConfig
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.model = TransformerBasedModelBert()
+     def forward(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None):
+         input_shape = input_ids.size()
+         if attention_mask is None:
+             attention_mask = torch.ones(input_shape, device=device)
+
+         return self.model(input_ids=input_ids, attention_mask=attention_mask)
+
+ config = MyConfigDistil(0.55)  # module-level instances below are imported by app.py
+ HF_DistilBertBasedModelAppDocs = MyHFModel_DistilBertBased(config)
+
+ config_db = MyConfig(0.55)
+ HF_BertBasedModelAppDocs = MyHFModel_BertBased(config_db)
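
For context, a minimal sketch of how app.py consumes these wrappers. It assumes the Hub checkpoints named above are reachable, and the input string is a placeholder:

import torch
from transformers import AutoTokenizer
from custom_models import HF_DistilBertBasedModelAppDocs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = HF_DistilBertBasedModelAppDocs.from_pretrained("ferdmartin/HF_DistilBertBasedModelAppDocs").to(device)

enc = tokenizer(["placeholder essay text"], padding='max_length', max_length=512, truncation=True)
with torch.inference_mode():
    logits = model(input_ids=torch.tensor(enc["input_ids"]).to(device),
                   attention_mask=torch.tensor(enc["attention_mask"]).to(device))
print(torch.softmax(logits, 1).cpu())  # column 0: human-made, column 1: AI-generated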
models/baseline_model_lr2.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd46197ecc509dbc783af2ca0e2fac048e76a21047a9b6f52f8cc761dcfdc665
+ size 2206719
models/baseline_model_nb2.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4d8e375acf6e9bf81062baf862cdcf84fd78c236dea4c649f9e8e648b0c8712
+ size 3371095
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ streamlit
+ psutil
+ scikit-learn==1.0.2
+ scipy==1.7.3
+ shap
+ numpy==1.21.6
+ matplotlib==3.5.3
+ eli5==0.13.0
+ torch
+ transformers
+ spacy
+ https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl