import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

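# Paste the essays you want to classify below.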
input_list = [""" WRITE HERE YOUR FIRST ESSAY """,
              """ WRITE HERE YOUR SECOND ESSAY """]

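# Helper functions for cleaning, tokenization and evaluation metrics.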
def clean_text(text):
    """
    Removes non-alphabetical characters and stopwords, and lowercases the text.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.

    Example:
        df['text'] = df['text'].apply(clean_text)
    """
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # keep letters only
    text = text.lower()
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)


def tokenize_function(batch):
    """
    Tokenizes the 'text' field of a batch of examples.

    Args:
        batch (dict): A batch of examples, as passed by datasets.Dataset.map.

    Returns:
        dict: The tokenized batch, including 'input_ids' and 'attention_mask'.

    Example:
        train_dataset_token = train_dataset.map(tokenize_function, batched=True)
    """
    return tokenizer(batch["text"], truncation=True)


def compute_metrics(eval_pred):
    """
    Computes the accuracy, precision, recall and f1 score of the model.

    It is passed to the Trainer, and its output is reported when the model
    is evaluated.

    Args:
        eval_pred (tuple): The predictions and labels of the model.

    Returns:
        dict: The accuracy, precision, recall and f1 score of the model.

    Example:
        >>> trainer.evaluate()
        {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    """
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


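# Load the fine-tuned detector and its tokenizer from the Hugging Face Hub.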
checkpoint = "diegovelilla/EssAI"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

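# Preprocess the essays: clean, tokenize, and convert to torch tensors.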
n_input = len(input_list)

df = pd.DataFrame({'text': input_list})
df['text'] = df['text'].apply(clean_text)

ds = Dataset.from_pandas(df)
ds_token = ds.map(tokenize_function, batched=True)

# Drop everything but the tensors the model expects as input.
ds_token = ds_token.remove_columns(["text", "token_type_ids"])
ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask'])

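# Set up a Trainer purely for batched inference; the default TrainingArguments
# are enough since no training happens here.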
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(".")

trainer = Trainer(
    model,
    training_args,
    eval_dataset=ds_token,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

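# Predict, then turn the raw logits into probabilities with softmax.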
predictions = trainer.predict(ds_token)
predictions = torch.from_numpy(predictions.predictions)
predictions = torch.nn.functional.softmax(predictions, dim=-1)

print('\n\n')
for i in range(n_input):
    index = torch.argmax(predictions[i])
    if index == 0:
        print(f'{i+1}: HUMAN {round(predictions[i][0].item() * 100, 2)}% confidence.')
    else:
        print(f'{i+1}: AI {round(predictions[i][1].item() * 100, 2)}% confidence.')
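
# Label 0 corresponds to HUMAN and label 1 to AI, so each line reports the
# model's confidence in the predicted class.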