from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from datetime import datetime
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TextClassificationPipeline:
    def __init__(self, model_path='./models', method='bertbased'):
        """
        Initialize the classification pipeline

        Args:
            model_path: Path to saved models
            method: 'bertbased' or 'baseline'
        """
        try:
            self.method = method

            if method == 'bertbased':
                logger.info("Loading BERT model...")
                self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    f"{model_path}/bert-model"
                )
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                self.model.to(self.device)
                self.model.eval()
                logger.info(f"BERT model loaded successfully. Using device: {self.device}")
            else:
                logger.info("Loading baseline model...")
                self.tfidf = joblib.load(f"{model_path}/baseline-model/tfidf_vectorizer.pkl")
                self.baseline_model = joblib.load(f"{model_path}/baseline-model/baseline_model.pkl")
                logger.info("Baseline model loaded successfully")

            # Load label encoder for both methods
            self.label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")

        except Exception as e:
            logger.error(f"Error initializing model: {str(e)}")
            raise

    def preprocess_text(self, text):
        """Clean and preprocess a single text string"""
        if isinstance(text, str):
            # Basic cleaning
            text = text.strip()
            text = ' '.join(text.split())  # Remove extra whitespace
            # Title-case to match the casing used in the training data
            text = text.title()
        return text

    def preprocess(self, text):
        """
        Preprocess the input text (string or list of strings) based on method
        """
        try:
            # Clean text first, handling both single strings and batches
            if isinstance(text, list):
                text = [self.preprocess_text(t) for t in text]
            else:
                text = self.preprocess_text(text)

            if self.method == 'bertbased':
                # BERT preprocessing
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    padding=True,
                    max_length=512,
                    return_tensors='pt'
                )
                encodings = {k: v.to(self.device) for k, v in encodings.items()}
                return encodings
            else:
                # Baseline preprocessing
                return self.tfidf.transform([text] if isinstance(text, str) else text)

        except Exception as e:
            logger.error(f"Error in preprocessing: {str(e)}")
            raise

    def predict(self, text, return_probability=False):
        """
        Predict using either BERT or baseline model

        Args:
            text: Input text or list of texts
            return_probability: Whether to return probability scores

        Returns:
            Predictions with metadata
        """
        try:
            # Handle both single string and list of strings
            if isinstance(text, str):
                text = [text]

            # Preprocess
            inputs = self.preprocess(text)

            if self.method == 'bertbased':
                # BERT predictions
                with torch.no_grad():
                    outputs = self.model(**inputs)
                    probabilities = torch.softmax(outputs.logits, dim=-1)
                    predictions = torch.argmax(probabilities, dim=-1)

                predictions = predictions.cpu().numpy()
                probabilities = probabilities.cpu().numpy()
            else:
                # Baseline predictions
                predictions = self.baseline_model.predict(inputs)
                probabilities = self.baseline_model.predict_proba(inputs)

            # Convert numeric predictions to original labels,
            # title-cased for consistent casing with the training data
            predicted_labels = [
                label.title()
                for label in self.label_encoder.inverse_transform(predictions)
            ]

            if return_probability:
                results = []
                for t, label, prob, probs in zip(
                    text, predicted_labels, probabilities.max(axis=1), probabilities
                ):
                    # Create detailed result dictionary
                    result = {
                        'text': t[:200] + '...' if len(t) > 200 else t,  # Truncate long text
                        'predicted_label': label,
                        'confidence': float(prob),
                        'model_type': self.method,
                        'probabilities': {
                            self.label_encoder.inverse_transform([i])[0].title(): float(p)
                            for i, p in enumerate(probs)
                        },
                        'timestamp': datetime.now().isoformat(),
                        'metadata': {
                            'model_name': 'BERT' if self.method == 'bertbased' else 'Baseline',
                            'text_length': len(t),
                            'preprocessing_steps': ['cleaning', 'tokenization']
                        }
                    }
                    results.append(result)
                return results[0] if len(text) == 1 else results

            return predicted_labels[0] if len(text) == 1 else predicted_labels

        except Exception as e:
            logger.error(f"Error in prediction: {str(e)}")
            raise

    def get_model_info(self):
        """Return model information"""
        return {
            'model_type': self.method,
            'model_name': 'BERT' if self.method == 'bertbased' else 'Baseline',
            'device': str(self.device) if self.method == 'bertbased' else 'CPU',
            'max_sequence_length': 512 if self.method == 'bertbased' else None,
            'number_of_classes': len(self.label_encoder.classes_),
            'classes': list(self.label_encoder.classes_)
        }
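

# Illustrative sketch (not part of the original pipeline): one way the
# artifacts loaded in __init__ could be produced at training time. The paths
# mirror those expected above; the LogisticRegression classifier and the
# TfidfVectorizer settings are assumptions, not the project's actual
# training code.
def save_baseline_artifacts(texts, labels, model_path='./models'):
    """Hypothetical helper: fit and persist the baseline artifacts."""
    import os

    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import LabelEncoder

    os.makedirs(f"{model_path}/baseline-model", exist_ok=True)

    # Shared label encoder, used by both the BERT and baseline paths
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)
    joblib.dump(label_encoder, f"{model_path}/label_encoder.pkl")

    # TF-IDF features plus a linear classifier (must support
    # predict_proba, since predict() calls it)
    tfidf = TfidfVectorizer(max_features=10000)
    X = tfidf.fit_transform(texts)
    baseline_model = LogisticRegression(max_iter=1000)
    baseline_model.fit(X, y)

    joblib.dump(tfidf, f"{model_path}/baseline-model/tfidf_vectorizer.pkl")
    joblib.dump(baseline_model, f"{model_path}/baseline-model/baseline_model.pkl")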


def load_and_process_pdf(url_or_file):
    """
    Load and process a PDF from a URL or local file path.
    Returns the extracted text.

    Sketch implementation: assumes the `pypdf` and `requests` packages;
    swap in your preferred PDF library as needed.
    """
    try:
        from io import BytesIO

        import requests
        from pypdf import PdfReader

        if isinstance(url_or_file, str) and url_or_file.startswith(('http://', 'https://')):
            # Download the PDF into memory
            response = requests.get(url_or_file, timeout=30)
            response.raise_for_status()
            source = BytesIO(response.content)
        else:
            source = url_or_file

        # Extract text page by page
        reader = PdfReader(source)
        return '\n'.join(page.extract_text() or '' for page in reader.pages)
    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        raise


# Example usage
if __name__ == "__main__":
    # Test the pipeline
    classifier = TextClassificationPipeline()

    # Test single prediction
    text = "Example construction document text"
    result = classifier.predict(text, return_probability=True)
    print("\nSingle Prediction Result:")
    print(result)

    # Test batch prediction
    texts = ["First document", "Second document"]
    results = classifier.predict(texts, return_probability=True)
    print("\nBatch Prediction Results:")
    for result in results:
        print(f"\nText: {result['text']}")
        print(f"Prediction: {result['predicted_label']}")
        print(f"Confidence: {result['confidence']:.4f}")
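
    # Inspect model metadata
    print("\nModel Info:")
    print(classifier.get_model_info())

    # Baseline usage sketch: assumes the TF-IDF/classifier artifacts exist
    # under ./models/baseline-model/ (see save_baseline_artifacts above).
    # baseline = TextClassificationPipeline(method='baseline')
    # print(baseline.predict("Example construction document text"))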