import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer

# Read the dataset (adjust the filename/path to match your CSV)
df = pd.read_csv('Training_Essay_Data.csv')

# Split the dataset into train/eval sets (fixed seed for reproducibility)
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Tokenize function: the tokenizer expects a list of strings, not a pandas Series
def tokenize_function(examples):
    return tokenizer(examples["text"].tolist(), padding="max_length", truncation=True, max_length=512)


# Tokenize the dataset
train_encodings = tokenize_function(train_df)
eval_encodings = tokenize_function(eval_df)


# Essay dataset class
class EssayDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)


# Dataset preparation
train_dataset = EssayDataset(train_encodings, train_df['label'].tolist())
eval_dataset = EssayDataset(eval_encodings, eval_df['label'].tolist())

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"  # renamed to eval_strategy in newer transformers releases
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()

# Save the fine-tuned model (and tokenizer, so inference can run standalone)
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

# Reload the model for prediction
model = BertForSequenceClassification.from_pretrained("./saved_model")


# Predicting
def predict(text):
    model.eval()  # disable dropout for deterministic inference
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():  # no gradients needed at inference time
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return "AI-generated" if prediction.item() == 1 else "Human-written"


# Get user input and predict
user_input = input("Enter the text you want to classify: ")
print("Classified as:", predict(user_input))