# --- IMPORTS ---
import gradio as gr
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
# -------------------------


# --- USEFUL FUNCTIONS ----
def clean_text(text):
    """
    Gets rid of non-alphabetical characters and stopwords, and lower-cases the text.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Keep only the words that are not stopwords
    words = [word for word in text.split() if word not in STOPWORDS]
    return ' '.join(words)


def tokenize_function(examples):
    """
    Tokenizes the 'text' field of a batch of examples.

    Args:
        examples (dict): A batch of examples with a 'text' field, as passed in by Dataset.map.

    Returns:
        dict: The tokenizer output (input_ids, attention_mask, ...).

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    return tokenizer(examples["text"], truncation=True)


def compute_metrics(eval_pred):
    """
    Computes the accuracy, precision, recall and F1 score of the model.
    It is passed to the Trainer and reported when evaluating the model.

    Args:
        eval_pred (tuple): The predictions and labels of the model.

    Returns:
        dict: The accuracy, precision, recall and F1 score of the model.

    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def predict(essay):
    """
    Makes a prediction based on the text input.

    Args:
        essay (str): The essay to check.

    Returns:
        str: The predicted label ("HUMAN" or "AI") with its confidence.
    """
    # --- DATA PREPROCESSING ---
    # Convert the input essay into a single-row DataFrame
    df = pd.DataFrame({'text': [essay]})
    # Get rid of non-alphabetical characters and stopwords, and lower-case the text
    df['text'] = df['text'].apply(clean_text)
    # Convert the pandas DataFrame into a Hugging Face Dataset and tokenize it
    ds = Dataset.from_pandas(df)
    ds_token = ds.map(tokenize_function, batched=True)
    # Drop the columns that are not needed and set the dataset format to PyTorch tensors
    ds_token = ds_token.remove_columns(["text", "token_type_ids"])
    ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    # -------------------------

    # --- INSTANTIATING TRAINER ----
    # A DataCollatorWithPadding pads the inputs dynamically in each batch
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Create the training arguments (only needed here to run inference through the Trainer)
    training_args = TrainingArguments(output_dir=".")
    # Create the trainer
    trainer = Trainer(
        model,
        training_args,
        eval_dataset=ds_token,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    # -------------------------

    # --- PREDICT ---
    # Run the prediction and format the output
    predictions = trainer.predict(ds_token)
    predictions = torch.from_numpy(predictions.predictions)
    predictions = torch.nn.functional.softmax(predictions, dim=-1)
    results = []
    index = torch.argmax(predictions[0])
    confidence = round(predictions[0][index].item() * 100, 2)
    label = "HUMAN" if index == 0 else "AI"
    results.append(f'{label} with {confidence}% confidence.')
    return "\n".join(results)
    # -------------------------
# -------------------------


# --- LOADING THE MODEL ---
# Load the fine-tuned tokenizer and model; the checkpoint classifies essays into 2 labels (HUMAN / AI)
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# -------------------------


# --- GRADIO APP ---
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        lines=2, placeholder="Enter your essay here...", label="Your essay"),
    outputs=gr.Textbox(label="Prediction Result"),
    title="EssAI",
    description="Detect AI-generated essays in a few seconds."
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
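
# --- EXAMPLE USAGE (illustrative sketch, not part of the original app) ---
# predict() can also be smoke-tested without the Gradio UI, for instance from a
# Python shell after importing this module. The sample essay below is invented
# for illustration, and the import assumes this file is saved as app.py; the
# returned string has the form "<HUMAN|AI> with <x>% confidence."
#
#     from app import predict
#     print(predict("Renewable energy adoption has accelerated over the past decade..."))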