# Import Libraries
import torch
from torch.optim import AdamW  # transformers' AdamW is deprecated; use PyTorch's
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from transformers import get_scheduler
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import gradio as gr
import numpy as np
import random

# Set Random Seeds for Reproducibility
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

# Load IMDb Dataset
dataset = load_dataset('imdb')

# Load Pretrained Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization Function
def tokenize_function(batch):
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)

# Tokenize the Dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove the Original Text Column to Save Memory
tokenized_datasets = tokenized_datasets.remove_columns(['text'])

# Rename 'label' to 'labels' for Compatibility with Transformers
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Set Dataset Format for PyTorch
tokenized_datasets.set_format("torch")

# Split the Data
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

# Create Data Loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load Pretrained BERT Model for Sequence Classification
# id2label/label2id make the pipeline return readable labels instead of LABEL_0/LABEL_1
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Learning Rate Scheduler
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Move Model to GPU if Available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training Loop
def train_model():
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}")
        for batch in train_loader:
            # Move Batch to Device
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            # Backpropagation
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        # Report the loss of the last batch in the epoch
        print(f"Loss: {loss.item()}")

# Evaluation Function
def evaluate_model():
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(batch["labels"].cpu().numpy())
    accuracy = accuracy_score(labels, preds)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(labels, preds))

# Train and Evaluate the Model
train_model()
evaluate_model()

# Save the Model for Deployment
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")

# Deploy the Model with Gradio
# The tokenizer was saved to the same directory, so the pipeline picks it up automatically
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="sentiment_model",
    device=0 if torch.cuda.is_available() else -1,
)

# Gradio Inference Function
def analyze_sentiment(review):
    result = sentiment_pipeline(review)
    return result[0]['label']

# Gradio Interface
iface = gr.Interface(
    fn=analyze_sentiment,
    inputs=gr.Textbox(lines=5, placeholder="Enter a movie review..."),
    outputs="text",
    title="IMDb Sentiment Analysis",
)

# Launch the Gradio App
iface.launch()
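
# --- Quick sanity check: a minimal sketch, separate from the app above ---
# Assumes only the "sentiment_model" directory saved earlier; the sample
# review text is illustrative. Note that iface.launch() blocks, so in this
# script these lines execute after the Gradio server is shut down; in
# practice you would run them in their own session.
clf = pipeline("sentiment-analysis", model="sentiment_model")
print(clf("A beautifully shot film with a gripping story."))
# With the id2label mapping configured above, the output looks like:
# [{'label': 'POSITIVE', 'score': 0.98}]  (score value illustrative)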