|
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
|
|
|
|
|
# Load the labelled essays. The rest of the script reads the "text" and
# "label" columns, so rows missing either one are dropped up front: they
# would otherwise fail later during tokenization or the int(label) cast.
df = pd.read_csv('Training_Essay_Data.csv')
df = df.dropna(subset=['text', 'label'])

# Hold out 10% of the rows for evaluation. The seed is fixed so the
# train/eval partition (and therefore the reported metrics) is reproducible.
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)
|
|
|
|
|
# Tokenizer matching the 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
|
|
|
|
|
|
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of *examples* with the module tokenizer.

    Args:
        examples: Any object indexable by ``"text"`` (the script passes a
            pandas DataFrame). The entry may be a single string or an
            iterable of texts.

    Returns:
        The tokenizer output, padded/truncated to exactly 512 tokens per text.
    """
    texts = examples["text"]
    if not isinstance(texts, str):
        # The tokenizer only accepts str / list[str]; a pandas Series is
        # rejected with a ValueError, so materialize it as a plain list.
        # str() also coerces any stray non-string cell instead of crashing.
        texts = [str(entry) for entry in texts]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)
|
|
|
|
|
|
|
# Tokenize both splits once, up front; the datasets below index into these.
train_encodings = tokenize_function(train_df)
eval_encodings = tokenize_function(eval_df)
|
|
|
|
|
|
|
class EssayDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to an indexable sequence with
            one entry per example (anything exposing ``.items()``).
        labels: Indexable sequence of labels, one per example; each value
            must be convertible with ``int()``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example *idx* as a dict of tensors, including ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)
|
|
|
|
|
|
|
# Wrap each split's encodings together with its labels, taken in the same
# row order as the texts that were tokenized.
train_dataset = EssayDataset(train_encodings, train_df['label'].tolist())
eval_dataset = EssayDataset(eval_encodings, eval_df['label'].tolist())
|
|
|
|
|
# Pretrained BERT encoder with a 2-way classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
|
|
|
|
|
# Fine-tuning hyperparameters; checkpoints go to ./results, logs to ./logs.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)
|
|
|
|
|
# Bind the model, the hyperparameters and both dataset splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
|
|
|
|
|
trainer.train()

# Persist the fine-tuned weights, and the tokenizer alongside them so that
# ./saved_model is a self-contained checkpoint directory.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

# Rebind `model` to a fresh copy loaded from disk; predict() below uses this
# reloaded instance, which also confirms the saved checkpoint is loadable.
model = BertForSequenceClassification.from_pretrained("./saved_model")
|
|
|
|
|
|
|
def predict(text):
    """Classify *text* with the module-level fine-tuned model.

    Args:
        text: The text to classify (a single string).

    Returns:
        ``"AI-generated"`` when the highest-scoring class index is 1,
        otherwise ``"Human-written"``.
    """
    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,  # same cap as used for the training encodings
        return_tensors="pt",
    )
    # Keep the inputs on whatever device the model currently lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return "AI-generated" if predictions.item() == 1 else "Human-written"
|
|
|
|
|
|
|
# Interactive check: classify one piece of text typed by the user.
user_input = input("Enter the text you want to classify: ")
print("Classified as:", predict(user_input))
|
|