diegovelilla committed on
Commit
b9c3ba7
1 Parent(s): 786c2e2

first upload of app.py

Files changed (1)
  1. app.py +183 -0
app.py ADDED
@@ -0,0 +1,183 @@
+ # --- IMPORTS ---
+
+ import gradio as gr
+ import torch
+ from datasets import Dataset
+ from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+ nltk.download('stopwords')
+ stop_words = set(stopwords.words('english'))
+
+ # -------------------------
+
+ # --- USEFUL FUNCTIONS ----
+
+
+ def clean_text(text):
+     """
+     Removes non-alphabetical characters and stopwords from the text and lower-cases it.
+
+     Args:
+         text (str): The text to be cleaned
+
+     Returns:
+         text (str): The cleaned text
+
+     Example:
+         df['text'] = df['text'].apply(clean_text)
+     """
+     text = re.sub(r'[^a-zA-Z]', ' ', text)
+     text = text.lower()
+     words = text.split()
+     # Keep only the words that are not stopwords, then rebuild the string
+     words = [word for word in words if word not in stop_words]
+     text = ' '.join(words)
+     return text
+
+
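+ # A hypothetical example of clean_text in action (assuming NLTK's default
+ # English stopword list): clean_text("This is an Essay!") returns "essay".
+
+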
+ def tokenize_function(dataframe):
+     """
+     Tokenizes the 'text' field of a batch of dataset examples.
+
+     Args:
+         dataframe (dict): A batch of examples containing a 'text' field
+
+     Returns:
+         dict: The tokenized batch
+
+     Example:
+         train_dataset_token = train_dataset.map(tokenize_function, batched=True)
+     """
+     return tokenizer(dataframe["text"], truncation=True)
+
+
+ def compute_metrics(eval_pred):
+     """
+     Computes the accuracy, precision, recall and f1 score of the model.
+
+     It is passed to the trainer and its output is reported when evaluating the model.
+
+     Args:
+         eval_pred (tuple): The predictions and labels of the model
+
+     Returns:
+         dict: The accuracy, precision, recall and f1 score of the model
+
+     Example:
+         >>> trainer.evaluate()
+         {
+             'accuracy': accuracy,
+             'precision': precision,
+             'recall': recall,
+             'f1': f1
+         }
+     """
+     predictions, labels = eval_pred
+     predictions = predictions.argmax(axis=-1)
+     accuracy = accuracy_score(labels, predictions)
+     precision, recall, f1, _ = precision_recall_fscore_support(
+         labels, predictions, average='binary')
+     return {
+         'accuracy': accuracy,
+         'precision': precision,
+         'recall': recall,
+         'f1': f1
+     }
+
+
+ def predict(essay):
+     """
+     Makes a prediction based on the text input.
+
+     Args:
+         essay (str): The essay to check
+
+     Returns:
+         str: The predicted label with its confidence
+     """
+     # --- DATA PREPROCESSING ---
+
+     # Convert the input into a dataframe
+     df = pd.DataFrame({'text': [essay]})
+
+     # Get rid of non-alphabetical characters and stopwords, and lower-case the text
+     df['text'] = df['text'].apply(clean_text)
+
+     # Convert the pandas dataframe into a Hugging Face dataset and tokenize it
+     ds = Dataset.from_pandas(df)
+     ds_token = ds.map(tokenize_function, batched=True)
+
+     # Drop columns that are not necessary and set the dataset format to pytorch tensors
+     ds_token = ds_token.remove_columns(["text", "token_type_ids"])
+     ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
+
+     # -------------------------
+
+     # --- INSTANTIATING TRAINER ----
+
+     # Instantiate a DataCollatorWithPadding to pad the inputs into batches
+     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+     # Create the training arguments (only the output directory is needed here)
+     training_args = TrainingArguments(".")
+
+     # Create the trainer
+     trainer = Trainer(
+         model,
+         training_args,
+         eval_dataset=ds_token,
+         data_collator=data_collator,
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics
+     )
+
+     # -------------------------
+
+     # --- PREDICT ---
+
+     # Predict and then format the output
+     predictions = trainer.predict(ds_token)
+     predictions = torch.from_numpy(predictions.predictions)
+     predictions = torch.nn.functional.softmax(predictions, dim=-1)
+     results = []
+     index = torch.argmax(predictions[0])
+     confidence = round(predictions[0][index].item() * 100, 2)
+     label = "HUMAN" if index == 0 else "AI"
+     results.append(f'{label} with {confidence}% confidence.')
+
+     return "\n".join(results)
+
+     # -------------------------
+
+ # -------------------------
+
+ # --- LOADING THE MODEL ---
+
+
+ # Load the tokenizer and the fine-tuned model, which classifies essays into 2 labels (HUMAN / AI)
+ checkpoint = "diegovelilla/EssAI"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+
+ # -------------------------
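+
+ # A quick local smoke test sketch for predict(); the sample string below is a
+ # made-up placeholder, and the call is left commented out so that running the
+ # file only launches the Gradio app:
+ # print(predict("Technology has changed the way students write essays."))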
+
+ # Build the Gradio interface around predict()
+ iface = gr.Interface(
+     fn=predict,
+     inputs=gr.Textbox(
+         lines=2, placeholder="Enter your essay here...", label="Your essay"),
+     outputs=gr.Textbox(label="Prediction Result"),
+     title="EssAI",
+     description="Detect AI-generated essays in a few seconds."
+ )
+
+ # Launch the app
+ if __name__ == "__main__":
+     iface.launch()
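
To try the change locally (assuming gradio, torch, transformers, datasets, scikit-learn, pandas and nltk are installed), save the file as app.py and run python app.py; Gradio serves the interface on http://127.0.0.1:7860 by default. The app returns a string such as "HUMAN with 97.2% confidence." (illustrative output following the format built in predict()).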