Spaces:

mlkorra
/

Product-doc-classifier

Sleeping

File size: 10,644 Bytes

a20a7ca

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import pandas as pd
from datetime import datetime
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TextClassificationPipeline:
    """
    Text classifier that wraps either a BERT sequence-classification model
    ('bertbased') or a TF-IDF + baseline model ('baseline').

    Both variants share one label encoder, loaded from
    ``{model_path}/label_encoder.pkl``, which maps numeric class ids back to
    the original label strings.
    """

    # Token limit passed to the BERT tokenizer; longer inputs are truncated.
    MAX_SEQUENCE_LENGTH = 512
    # Result dicts echo at most this many characters of the input text.
    TEXT_PREVIEW_CHARS = 200

    def __init__(self, model_path='./models', method='bertbased'):
        """
        Initialize the classification pipeline
        Args:
            model_path: Directory containing ``bert-model/``,
                ``baseline-model/`` and ``label_encoder.pkl``
            method: 'bertbased' or 'baseline'. Any other value loads the
                baseline model (a warning is logged).
        Raises:
            Exception: any error raised while loading the artifacts is
                logged and re-raised unchanged.
        """
        try:
            self.method = method

            if method == 'bertbased':
                logger.info("Loading BERT model...")
                # NOTE(review): the tokenizer is the stock 'bert-base-uncased'
                # one, not loaded from model_path -- confirm the fine-tuned
                # model was trained with the same vocabulary.
                self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
                self.model = AutoModelForSequenceClassification.from_pretrained(f"{model_path}/bert-model")
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                self.model.to(self.device)
                self.model.eval()
                logger.info(f"BERT model loaded successfully. Using device: {self.device}")
            else:
                if method != 'baseline':
                    # Keep the historical fallback, but make it visible.
                    logger.warning(
                        "Unknown method %r; falling back to the baseline model", method
                    )
                logger.info("Loading baseline model...")
                # NOTE: joblib.load unpickles the file; model_path must only
                # ever point at trusted artifacts.
                self.tfidf = joblib.load(f"{model_path}/baseline-model/tfidf_vectorizer.pkl")
                self.baseline_model = joblib.load(f"{model_path}/baseline-model/baseline_model.pkl")
                logger.info("Baseline model loaded successfully")

            # Load label encoder for both methods
            self.label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")

        except Exception as e:
            logger.error("Error initializing model: %s", e)
            raise

    def preprocess_text(self, text):
        """
        Clean and preprocess text.

        Strings are stripped, have runs of whitespace collapsed to a single
        space, and are title-cased (first letter of every word capitalized).
        Lists/tuples are cleaned element by element and returned as a list,
        so batches get the same treatment as single strings. Any other value
        is returned unchanged.
        """
        if isinstance(text, str):
            # split() with no argument already drops leading/trailing
            # whitespace, so no separate strip() is needed.
            return ' '.join(text.split()).title()
        if isinstance(text, (list, tuple)):
            return [self.preprocess_text(item) for item in text]
        return text

    def preprocess(self, text):
        """
        Preprocess the input text based on method
        Args:
            text: A single string or a list of strings
        Returns:
            For 'bertbased': dict of tokenizer tensors moved to self.device.
            Otherwise: the result of ``self.tfidf.transform`` on the texts.
        """
        try:
            # Clean text first
            text = self.preprocess_text(text)

            if self.method == 'bertbased':
                # BERT preprocessing
                encodings = self.tokenizer(
                    text,
                    truncation=True,
                    padding=True,
                    max_length=self.MAX_SEQUENCE_LENGTH,
                    return_tensors='pt'
                )
                return {k: v.to(self.device) for k, v in encodings.items()}

            # Baseline preprocessing: the vectorizer expects an iterable of
            # documents, so a lone string is wrapped in a list.
            return self.tfidf.transform([text] if isinstance(text, str) else text)

        except Exception as e:
            logger.error("Error in preprocessing: %s", e)
            raise

    def _run_model(self, texts):
        """Run the active model on a list of texts; return (class ids, probabilities)."""
        inputs = self.preprocess(texts)

        if self.method == 'bertbased':
            with torch.no_grad():
                outputs = self.model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=-1)
                predictions = torch.argmax(probabilities, dim=-1)
            return predictions.cpu().numpy(), probabilities.cpu().numpy()

        return (
            self.baseline_model.predict(inputs),
            self.baseline_model.predict_proba(inputs),
        )

    def _build_results(self, texts, labels, probabilities, title_case):
        """Build one detailed result dict per input text."""
        # Column i of `probabilities` is reported under the label the encoder
        # gives for class id i; resolve all names once instead of per sample.
        class_names = self.label_encoder.inverse_transform(
            np.arange(probabilities.shape[1])
        )
        if title_case:
            class_names = [name.title() for name in class_names]

        model_name = 'BERT' if self.method == 'bertbased' else 'Baseline'
        limit = self.TEXT_PREVIEW_CHARS

        results = []
        for t, label, probs in zip(texts, labels, probabilities):
            results.append({
                # Echo the caller's original text, truncated for readability
                'text': t[:limit] + '...' if len(t) > limit else t,
                'predicted_label': label,
                'confidence': float(probs.max()),
                'model_type': self.method,
                'probabilities': {
                    name: float(p) for name, p in zip(class_names, probs)
                },
                'timestamp': datetime.now().isoformat(),
                'metadata': {
                    'model_name': model_name,
                    'text_length': len(t),
                    'preprocessing_steps': ['cleaning', 'tokenization']
                }
            })
        return results

    def predict(self, text, return_probability=False):
        """
        Predict using either BERT or baseline model
        Args:
            text: Input text or list of texts
            return_probability: Whether to return probability scores
        Returns:
            Title-cased label(s), or detailed result dict(s) when
            return_probability is True. A single input yields a single
            item; a batch yields a list; an empty batch yields [].
        """
        try:
            # Handle both single string and list of strings
            texts = [text] if isinstance(text, str) else list(text)
            if not texts:
                return []

            predictions, probabilities = self._run_model(texts)

            # Convert numeric predictions to original labels, title-cased so
            # the output casing is consistent.
            predicted_labels = [
                label.title()
                for label in self.label_encoder.inverse_transform(predictions)
            ]

            if return_probability:
                results = self._build_results(
                    texts, predicted_labels, probabilities, title_case=True
                )
                return results[0] if len(texts) == 1 else results

            return predicted_labels[0] if len(texts) == 1 else predicted_labels

        except Exception as e:
            logger.error("Error in prediction: %s", e)
            raise

    def predict_old(self, text, return_probability=False):
        """
        Legacy variant of predict() that reports labels exactly as the label
        encoder returns them (no title-casing).
        Args:
            text: Input text or list of texts
            return_probability: Whether to return probability scores
        Returns:
            Predictions with metadata
        """
        try:
            # Handle both single string and list of strings
            texts = [text] if isinstance(text, str) else list(text)
            if not texts:
                return []

            predictions, probabilities = self._run_model(texts)

            # Convert numeric predictions to original labels
            predicted_labels = self.label_encoder.inverse_transform(predictions)

            if return_probability:
                results = self._build_results(
                    texts, predicted_labels, probabilities, title_case=False
                )
                return results[0] if len(texts) == 1 else results

            return predicted_labels[0] if len(texts) == 1 else predicted_labels

        except Exception as e:
            logger.error("Error in prediction: %s", e)
            raise

    def get_model_info(self):
        """Return a dict describing the loaded model and its label set."""
        is_bert = self.method == 'bertbased'
        return {
            'model_type': self.method,
            'model_name': 'BERT' if is_bert else 'Baseline',
            'device': str(self.device) if is_bert else 'CPU',
            'max_sequence_length': self.MAX_SEQUENCE_LENGTH if is_bert else None,
            'number_of_classes': len(self.label_encoder.classes_),
            'classes': list(self.label_encoder.classes_)
        }

def load_and_process_pdf(url_or_file):
    """
    Placeholder for loading a PDF from a URL or file and extracting its text.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO: add the PDF processing code here and return the extracted text.
    return None

# Example usage
if __name__ == "__main__":
    # Smoke-test the pipeline with its default arguments
    classifier = TextClassificationPipeline()

    # One document in, one detailed result out
    sample = "Example construction document text"
    single = classifier.predict(sample, return_probability=True)
    print("\nSingle Prediction Result:", single, sep="\n")

    # Several documents in, a list of detailed results out
    batch = ["First document", "Second document"]
    batch_results = classifier.predict(batch, return_probability=True)
    print("\nBatch Prediction Results:")
    for entry in batch_results:
        print(
            f"\nText: {entry['text']}",
            f"Prediction: {entry['predicted_label']}",
            f"Confidence: {entry['confidence']:.4f}",
            sep="\n",
        )